From 7c06b9c3efce7297d29ab41ee6bc16a3c332ff55 Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Sun, 29 Mar 2026 21:00:33 -0700
Subject: [PATCH 01/14] Chm02 CUDA: advance single-graph bring-up

---
 src/PerfectHash/Graph.c      |  62 ++++++++++
 src/PerfectHash/GraphCu.c    | 171 +++++++++++++++++++++------
 src/PerfectHashCuda/Graph.cu | 221 +++++++++++++++++------------------
 3 files changed, 300 insertions(+), 154 deletions(-)

diff --git a/src/PerfectHash/Graph.c b/src/PerfectHash/Graph.c
index bc2deaba..9bb8d30b 100644
--- a/src/PerfectHash/Graph.c
+++ b/src/PerfectHash/Graph.c
@@ -544,6 +544,7 @@ Return Value:
     PPERFECT_HASH_CONTEXT Context;
     PASSIGNED_MEMORY_COVERAGE Coverage;
     PASSIGNED16_MEMORY_COVERAGE Coverage16;
+    BOOLEAN DebugCudaSolve;
 
     //
     // Initialize aliases.
@@ -554,6 +555,13 @@ Return Value:
     Table = Context->Table;
     NumberOfKeys = Table->Keys->NumberOfKeys.LowPart;
     Edges = Keys = (PKEY)Table->Keys->KeyArrayBaseAddress;
+    DebugCudaSolve = FALSE;
+
+#ifdef PH_WINDOWS
+    DebugCudaSolve = (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0);
+#else
+    DebugCudaSolve = (getenv("PH_DEBUG_CUDA_CHM02") != NULL);
+#endif
 
     //
     // Attempt to add all the keys to the graph.
@@ -561,6 +569,12 @@ Return Value:
 
     Result = Graph->Vtbl->AddKeys(Graph, NumberOfKeys, Keys);
 
+    if (DebugCudaSolve && IsCuGraph(Graph)) {
+        fprintf(stderr,
+                "[GraphSolve] AddKeys result=0x%08x\n",
+                (unsigned)Result);
+    }
+
     if (FAILED(Result)) {
 
         //
@@ -594,6 +608,14 @@ Return Value:
     //
 
     Result = Graph->Vtbl->IsAcyclic(Graph);
+    if (DebugCudaSolve && IsCuGraph(Graph)) {
+        fprintf(stderr,
+                "[GraphSolve] IsAcyclic result=0x%08x DeletedEdgeCount=%u "
+                "OrderIndex=%ld\n",
+                (unsigned)Result,
+                (unsigned)Graph->DeletedEdgeCount,
+                (long)Graph->OrderIndex);
+    }
     if (FAILED(Result)) {
 
         //
@@ -643,6 +665,12 @@ Return Value:
 
     Result = Graph->Vtbl->Assign(Graph);
 
+    if (DebugCudaSolve && IsCuGraph(Graph)) {
+        fprintf(stderr,
+                "[GraphSolve] Assign result=0x%08x\n",
+                (unsigned)Result);
+    }
+
     //
     // Assign() should always succeed.
     //
@@ -3974,6 +4002,7 @@ Return Value:
 {
     PGRAPH NewGraph;
     HRESULT Result = S_OK;
+    BOOLEAN DebugCudaSolve;
 
     //
     // Validate arguments.
@@ -3983,6 +4012,13 @@ Return Value:
         return E_POINTER;
     }
 
+    DebugCudaSolve = FALSE;
+#ifdef PH_WINDOWS
+    DebugCudaSolve = (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0);
+#else
+    DebugCudaSolve = (getenv("PH_DEBUG_CUDA_CHM02") != NULL);
+#endif
+
     //
     // Acquire the exclusive graph lock for the duration of the routine.  The
     // graph should never be locked at this point; if it is, consider it a
@@ -4001,6 +4037,12 @@ Return Value:
 
     Result = Graph->Vtbl->LoadInfo(Graph);
 
+    if (DebugCudaSolve && IsCuGraph(Graph)) {
+        fprintf(stderr,
+                "[GraphEnterSolvingLoop] LoadInfo result=0x%08x\n",
+                (unsigned)Result);
+    }
+
     if (FAILED(Result)) {
 
         if (Result != E_OUTOFMEMORY) {
@@ -4040,6 +4082,11 @@ Return Value:
     while (GraphShouldWeContinueTryingToSolve(Graph)) {
 
         Result = Graph->Vtbl->Reset(Graph);
+        if (DebugCudaSolve && IsCuGraph(Graph)) {
+            fprintf(stderr,
+                    "[GraphEnterSolvingLoop] Reset result=0x%08x\n",
+                    (unsigned)Result);
+        }
         if (FAILED(Result)) {
             PH_ERROR(GraphReset, Result);
             break;
@@ -4048,6 +4095,16 @@ Return Value:
         }
 
         Result = Graph->Vtbl->LoadNewSeeds(Graph);
+        if (DebugCudaSolve && IsCuGraph(Graph)) {
+            fprintf(stderr,
+                    "[GraphEnterSolvingLoop] LoadNewSeeds result=0x%08x "
+                    "Seed1=%u Seed2=%u Seed3=%u Seed4=%u\n",
+                    (unsigned)Result,
+                    (unsigned)Graph->Seed1,
+                    (unsigned)Graph->Seed2,
+                    (unsigned)Graph->Seed3,
+                    (unsigned)Graph->Seed4);
+        }
         if (FAILED(Result)) {
 
             //
@@ -4061,6 +4118,11 @@ Return Value:
 
         NewGraph = NULL;
         Result = Graph->Vtbl->Solve(Graph, &NewGraph);
+        if (DebugCudaSolve && IsCuGraph(Graph)) {
+            fprintf(stderr,
+                    "[GraphEnterSolvingLoop] Solve result=0x%08x\n",
+                    (unsigned)Result);
+        }
         if (FAILED(Result)) {
             PH_ERROR(GraphSolve, Result);
             break;
diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c
index bcf041e8..4ec65ab2 100644
--- a/src/PerfectHash/GraphCu.c
+++ b/src/PerfectHash/GraphCu.c
@@ -1317,6 +1317,19 @@ GraphCuLoadNewSeeds(
     return Result;
 }
 
+FORCEINLINE
+BOOLEAN
+IsCudaDebugEnabled(
+    VOID
+    )
+{
+#ifdef PH_WINDOWS
+    return (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0);
+#else
+    return (getenv("PH_DEBUG_CUDA_CHM02") != NULL);
+#endif
+}
+
 HRESULT
 GraphCuAddKeys(
     _In_ PGRAPH Graph,
@@ -1325,6 +1338,7 @@ GraphCuAddKeys(
     )
 {
     PCU Cu;
+    HRESULT Result;
 
     //
     // Keys have already been prepared on the GPU, so we don't need to use
@@ -1336,10 +1350,22 @@ GraphCuAddKeys(
 
     Cu = Graph->CuSolveContext->DeviceContext->Cu;
 
-    return Cu->AddKeys(Graph,
-                       Graph->CuBlocksPerGrid,
-                       Graph->CuThreadsPerBlock,
-                       Graph->CuSharedMemory);
+    Result = Cu->AddKeys(Graph,
+                         Graph->CuBlocksPerGrid,
+                         Graph->CuThreadsPerBlock,
+                         Graph->CuSharedMemory);
+
+    if (IsCudaDebugEnabled()) {
+        fprintf(stderr,
+                "[GraphCuAddKeys] Result=0x%08x HashKeysResult=0x%08x "
+                "VertexFailures=%u WarpFailures=%u\n",
+                (unsigned)Result,
+                (unsigned)Graph->CuHashKeysResult,
+                (unsigned)Graph->CuVertexCollisionFailures,
+                (unsigned)Graph->CuWarpVertexCollisionFailures);
+    }
+
+    return Result;
 }
 
 HRESULT
@@ -1372,6 +1398,16 @@ GraphCuIsAcyclic(
                            Graph->CuThreadsPerBlock,
                            Graph->CuSharedMemory);
 
+    if (IsCudaDebugEnabled()) {
+        fprintf(stderr,
+                "[GraphCuIsAcyclic] GpuResult=0x%08x Attempts=%u "
+                "DeletedEdges=%u OrderIndex=%ld\n",
+                (unsigned)Result,
+                (unsigned)Graph->CuIsAcyclicPhase1Attempts,
+                (unsigned)Graph->DeletedEdgeCount,
+                (long)Graph->OrderIndex);
+    }
+
     //
     // If we weren't acyclic, return.
     //
@@ -1438,57 +1474,107 @@ GraphCuIsAcyclic(
     ASSERT(Graph->CpuGraph != NULL);
     ASSERT(Graph->Impl == 3);
 
-    if (IsUsingAssigned16(Graph)) {
-        Result = Graph->CpuGraph->Vtbl->AddKeys(Graph->CpuGraph,
-                                                NumberOfKeys,
-                                                Keys);
-        if (FAILED(Result)) {
-            InterlockedIncrement64(
-                &Context->GpuAddKeysSuccessButCpuAddKeysFailures);
-            return Result;
-        } else {
-            InterlockedIncrement64(&Context->GpuAndCpuAddKeysSuccess);
+    Result = Graph->CpuGraph->Vtbl->AddKeys(Graph->CpuGraph,
+                                            NumberOfKeys,
+                                            Keys);
+    if (FAILED(Result)) {
+        InterlockedIncrement64(
+            &Context->GpuAddKeysSuccessButCpuAddKeysFailures);
+        return Result;
+    } else {
+        InterlockedIncrement64(&Context->GpuAndCpuAddKeysSuccess);
+    }
+
+    if (IsCudaDebugEnabled()) {
+        HRESULT CpuAcyclicResult;
+        ULONG MismatchIndex;
+        ULONG CpuOrder;
+        ULONG GpuOrder;
+
+        CpuAcyclicResult = Graph->CpuGraph->Vtbl->IsAcyclic(Graph->CpuGraph);
+        fprintf(stderr,
+                "[GraphCuIsAcyclic] CpuAcyclicOracleResult=0x%08x\n",
+                (unsigned)CpuAcyclicResult);
+
+        if (SUCCEEDED(CpuAcyclicResult)) {
+            MismatchIndex = (ULONG)-1;
+
+            if (IsUsingAssigned16(Graph)) {
+                for (ULONG Index = 0; Index < NumberOfKeys; Index++) {
+                    CpuOrder = ((PUSHORT)Graph->CpuGraph->Order16)[Index];
+                    GpuOrder = ((PUSHORT)Graph->Order16)[Index];
+                    if (CpuOrder != GpuOrder) {
+                        MismatchIndex = Index;
+                        fprintf(stderr,
+                                "[GraphCuIsAcyclic] OrderMismatch16 index=%u "
+                                "gpu=%u cpu=%u\n",
+                                (unsigned)Index,
+                                (unsigned)GpuOrder,
+                                (unsigned)CpuOrder);
+                        break;
+                    }
+                }
+            } else {
+                for (ULONG Index = 0; Index < NumberOfKeys; Index++) {
+                    CpuOrder = ((PULONG)Graph->CpuGraph->Order)[Index];
+                    GpuOrder = ((PULONG)Graph->Order)[Index];
+                    if (CpuOrder != GpuOrder) {
+                        MismatchIndex = Index;
+                        fprintf(stderr,
+                                "[GraphCuIsAcyclic] OrderMismatch index=%u "
+                                "gpu=%u cpu=%u\n",
+                                (unsigned)Index,
+                                (unsigned)GpuOrder,
+                                (unsigned)CpuOrder);
+                        break;
+                    }
+                }
+            }
+
+            if (MismatchIndex == (ULONG)-1) {
+                fprintf(stderr,
+                        "[GraphCuIsAcyclic] Order arrays match CPU oracle.\n");
+            }
         }
 
-        Result = Graph->CpuGraph->Vtbl->IsAcyclic(Graph->CpuGraph);
+        Result = Graph->CpuGraph->Vtbl->Reset(Graph->CpuGraph);
         if (FAILED(Result)) {
-            InterlockedIncrement64(
-                &Context->GpuIsAcyclicButCpuIsCyclicFailures);
             return Result;
-        } else {
-            InterlockedIncrement64(&Context->GpuAndCpuIsAcyclicSuccess);
         }
-    } else {
+
         Result = Graph->CpuGraph->Vtbl->AddKeys(Graph->CpuGraph,
                                                 NumberOfKeys,
                                                 Keys);
         if (FAILED(Result)) {
-            InterlockedIncrement64(
-                &Context->GpuAddKeysSuccessButCpuAddKeysFailures);
             return Result;
-        } else {
-            InterlockedIncrement64(&Context->GpuAndCpuAddKeysSuccess);
-        }
-
-        Result = Graph->CpuGraph->Vtbl->IsAcyclic(Graph->CpuGraph);
-        if (FAILED(Result)) {
-            InterlockedIncrement64(
-                &Context->GpuIsAcyclicButCpuIsCyclicFailures);
-            return Result;
-        } else {
-            InterlockedIncrement64(&Context->GpuAndCpuIsAcyclicSuccess);
         }
     }
 
     //
-    // Copy the Order[] array from the CPU graph.
+    // The GPU path now owns the peel order.  Feed the captured Order[] into the
+    // CPU graph so Assign()/Verify() can act as an oracle during bring-up
+    // without recomputing IsAcyclic() on the CPU.
     //
 
-    ASSERT(SUCCEEDED(Result));
-    CopyMemory(Graph->Order,
-               Graph->CpuGraph->Order,
+    CopyMemory(Graph->CpuGraph->Order,
+               Graph->Order,
                Info->OrderSizeInBytes);
 
+    Graph->Flags.IsAcyclic = TRUE;
+    Graph->DeletedEdgeCount = NumberOfKeys;
+    Graph->OrderIndex = 0;
+
+    Graph->CpuGraph->Flags.IsAcyclic = TRUE;
+    Graph->CpuGraph->DeletedEdgeCount = NumberOfKeys;
+
+    if (IsUsingAssigned16(Graph)) {
+        Graph->CpuGraph->Order16Index = 0;
+    } else {
+        Graph->CpuGraph->OrderIndex = 0;
+    }
+
+    InterlockedIncrement64(&Context->GpuAndCpuIsAcyclicSuccess);
+
     return Result;
 }
 
@@ -1500,7 +1586,18 @@ GraphCuAssign(
     PRTL Rtl;
     HRESULT Result;
 
+    if (IsCudaDebugEnabled()) {
+        fprintf(stderr,
+                "[GraphCuAssign] Enter IsAcyclic=%u OrderIndex=%ld CpuOrderIndex=%ld\n",
+                (unsigned)Graph->Flags.IsAcyclic,
+                (long)Graph->OrderIndex,
+                (long)Graph->CpuGraph->OrderIndex);
+    }
+
     Result = Graph->CpuGraph->Vtbl->Assign(Graph->CpuGraph);
+    if (IsCudaDebugEnabled()) {
+        fprintf(stderr, "[GraphCuAssign] CpuAssignResult=0x%08x\n", (unsigned)Result);
+    }
     if (FAILED(Result)) {
         return Result;
     }
diff --git a/src/PerfectHashCuda/Graph.cu b/src/PerfectHashCuda/Graph.cu
index 62709ff3..9dfbe758 100644
--- a/src/PerfectHashCuda/Graph.cu
+++ b/src/PerfectHashCuda/Graph.cu
@@ -205,7 +205,6 @@ GraphCuAddEdge1(
             PrevVertex3 = *Vertex;
 
             NextDegree = PrevVertex3.Degree + 1;
-            NextDegree += 1;
 
             NextEdges = PrevVertex3.Edges;
             NextEdges ^= Edge;
@@ -808,9 +807,8 @@ GraphCuRemoveVertex(
     //DegreeType Degree;
     Edge3Type Edge3;
     Edge3Type *Edges3 = (decltype(Edges3))Graph->Edges3;
-    //OrderType *Order = (decltype(Order))Graph->Order;
-    //OrderType *OrderAddress;
-    //OrderIndexType OrderIndex;
+    OrderType *Order = (decltype(Order))Graph->Order;
+    OrderIndexType OrderIndex;
     OrderIndexType *GraphOrderIndex =
         (decltype(GraphOrderIndex))&Graph->OrderIndex;
 
@@ -848,19 +846,11 @@ GraphCuRemoveVertex(
 
     if (Removed1 || Removed2) {
         AtomicAggIncCGV(&Graph->DeletedEdgeCount);
-        AtomicAggSubCGV(GraphOrderIndex);
-#if 0
-        OrderIndex = AtomicAggSubCG(GraphOrderIndex);
-        OrderAddress = &Order[OrderIndex];
+        OrderIndex = atomicSub(GraphOrderIndex, 1) - 1;
 
         if (OrderIndex >= 0) {
-            ASSERT(*OrderAddress == 0);
-            ASSERT(Order[OrderIndex] == 0);
-            Order[OrderIndex] = Edge;
-            ASSERT(Order[OrderIndex] == Edge);
-            ASSERT(*OrderAddress == Edge);
+            Order[OrderIndex] = (OrderType)Edge;
         }
-#endif
     }
 
 End:
@@ -897,6 +887,75 @@ GraphCuIsAcyclicPhase1Kernel(
     return;
 }
 
+template<typename GraphType>
+GLOBAL
+VOID
+GraphCuIsAcyclicSerialKernel(
+    GraphType* Graph
+    )
+{
+    using EdgeType = typename GraphType::EdgeType;
+    using Edge3Type = typename GraphType::Edge3Type;
+    using OrderType = typename GraphType::OrderType;
+    using VertexType = typename GraphType::VertexType;
+    using Vertex3Type = typename GraphType::Vertex3Type;
+
+    if (blockIdx.x != 0 || threadIdx.x != 0) {
+        return;
+    }
+
+    auto *Vertices3 = (Vertex3Type *)Graph->Vertices3;
+    auto *Edges3 = (Edge3Type *)Graph->Edges3;
+    auto *Order = (OrderType *)Graph->Order;
+
+    Graph->Flags.Shrinking = TRUE;
+
+    while (TRUE) {
+        bool Progress = false;
+
+        for (uint32_t Index = 0; Index < Graph->NumberOfVertices; Index++) {
+            VertexType VertexIndex = (VertexType)Index;
+            Vertex3Type *Vertex = &Vertices3[VertexIndex];
+
+            if (Vertex->Degree != 1) {
+                continue;
+            }
+
+            EdgeType Edge = Vertex->Edges;
+            Edge3Type *Edge3 = &Edges3[Edge];
+            VertexType Vertex1 = Edge3->Vertex1;
+            VertexType Vertex2 = Edge3->Vertex2;
+
+            if (Vertices3[VertexIndex].Degree != 1) {
+                continue;
+            }
+
+            if (Vertices3[Vertex1].Degree > 0) {
+                --Vertices3[Vertex1].Degree;
+                Vertices3[Vertex1].Edges ^= Edge;
+            }
+
+            if (Vertices3[Vertex2].Degree > 0) {
+                --Vertices3[Vertex2].Degree;
+                Vertices3[Vertex2].Edges ^= Edge;
+            }
+
+            ++Graph->DeletedEdgeCount;
+            Graph->OrderIndex -= 1;
+            Order[Graph->OrderIndex] = (OrderType)Edge;
+            Progress = true;
+        }
+
+        if (!Progress) {
+            break;
+        }
+    }
+
+    if (Graph->DeletedEdgeCount == Graph->NumberOfKeys) {
+        Graph->Flags.IsAcyclic = TRUE;
+    }
+}
+
 EXTERN_C
 HOST
 HRESULT
@@ -908,10 +967,8 @@ GraphCuIsAcyclic(
     )
 {
     BOOLEAN IsAcyclic = FALSE;
-    ULONG Attempts = 0;
-    LONG OrderIndexDelta = 0;
-    LONG PreviousOrderIndex = 0;
     LONG DeviceOrderIndex = 0;
+    ULONG DeviceDeletedEdges = 0;
 
     HRESULT Result;
     PGRAPH DeviceGraph;
@@ -919,28 +976,6 @@ GraphCuIsAcyclic(
     PPH_CU_SOLVE_CONTEXT SolveContext;
     ULONG SharedMemory = SharedMemoryInBytes;
 
-    //
-    // Get suitable launch parameters for the IsAcyclicPhase1() kernel.
-    //
-
-    if (IsUsingAssigned16(Graph)) {
-        Result = GetKernelConfig(Graph,
-                                 (PVOID)GraphCuIsAcyclicPhase1Kernel<GRAPH16>,
-                                 BlocksPerGrid,
-                                 ThreadsPerBlock,
-                                 SharedMemory);
-    } else {
-        Result = GetKernelConfig(Graph,
-                                 (PVOID)GraphCuIsAcyclicPhase1Kernel<GRAPH32>,
-                                 BlocksPerGrid,
-                                 ThreadsPerBlock,
-                                 SharedMemory);
-    }
-
-    if (FAILED(Result)) {
-        goto End;
-    }
-
     //
     // Initialize aliases.
     //
@@ -949,92 +984,44 @@ GraphCuIsAcyclic(
     DeviceGraph = SolveContext->DeviceGraph;
     Stream = (CUstream_st *)SolveContext->Stream;
 
-    //
-    // Enter the kernel launch loop.
-    //
-
-    while (TRUE) {
-
-        ++Attempts;
-
-        //
-        // Dispatch the appropriate kernel and wait for completion.
-        //
-
-        if (IsUsingAssigned16(Graph)) {
-            GraphCuIsAcyclicPhase1Kernel<<<BlocksPerGrid,
-                                           ThreadsPerBlock,
-                                           SharedMemory,
-                                           Stream>>>((PGRAPH16)DeviceGraph);
-        } else {
-            GraphCuIsAcyclicPhase1Kernel<<<BlocksPerGrid,
-                                           ThreadsPerBlock,
-                                           SharedMemory,
-                                           Stream>>>((PGRAPH32)DeviceGraph);
-        }
-
-        CUDA_CALL(cudaMemcpyAsync((PVOID)&DeviceOrderIndex,
-                                  (PVOID)&DeviceGraph->OrderIndex,
-                                  sizeof(DeviceOrderIndex),
-                                  cudaMemcpyDeviceToHost,
-                                  Stream));
-
-        CUDA_CALL(cudaStreamSynchronize(Stream));
-
-        //
-        // Check to see if our OrderIndex has reached 0, this is indicative
-        // of an acyclic graph.  (We use <= 0 because we may see -1 or 0 in
-        // some cases.)
-        //
+    BlocksPerGrid = 1;
+    ThreadsPerBlock = 1;
+    SharedMemory = 0;
 
-        if (DeviceOrderIndex <= 0) {
-
-            //
-            // We were able to delete all vertices with degree 1, therefore,
-            // our graph is acyclic.
-            //
-
-            IsAcyclic = TRUE;
-            break;
-        }
-
-        //
-        // If this is our first pass, capture the OrderIndex as previous and
-        // continue.
-        //
+    if (IsUsingAssigned16(Graph)) {
+        GraphCuIsAcyclicSerialKernel<<<BlocksPerGrid,
+                                       ThreadsPerBlock,
+                                       SharedMemory,
+                                       Stream>>>((PGRAPH16)DeviceGraph);
+    } else {
+        GraphCuIsAcyclicSerialKernel<<<BlocksPerGrid,
+                                       ThreadsPerBlock,
+                                       SharedMemory,
+                                       Stream>>>((PGRAPH32)DeviceGraph);
+    }
 
-        if (Attempts == 1) {
-            PreviousOrderIndex = DeviceOrderIndex;
-            continue;
-        }
+    CUDA_CALL(cudaMemcpyAsync((PVOID)&DeviceOrderIndex,
+                              (PVOID)&DeviceGraph->OrderIndex,
+                              sizeof(DeviceOrderIndex),
+                              cudaMemcpyDeviceToHost,
+                              Stream));
 
-        //
-        // Calculate the delta between the current OrderIndex and what we saw
-        // on the last pass.  If they haven't changed, it means we weren't able
-        // to find any more vertices with degree 1 to delete, which means the
-        // graph isn't acyclic.
-        //
+    CUDA_CALL(cudaMemcpyAsync((PVOID)&DeviceDeletedEdges,
+                              (PVOID)&DeviceGraph->DeletedEdgeCount,
+                              sizeof(DeviceDeletedEdges),
+                              cudaMemcpyDeviceToHost,
+                              Stream));
 
-        OrderIndexDelta = PreviousOrderIndex - DeviceOrderIndex;
-        ASSERT(OrderIndexDelta >= 0);
-        if (OrderIndexDelta == 0) {
-            break;
-        }
+    CUDA_CALL(cudaStreamSynchronize(Stream));
 
-        //
-        // Update previous value and continue for another pass.
-        //
+    Graph->CuIsAcyclicPhase1Attempts = 1;
+    Graph->DeletedEdgeCount = DeviceDeletedEdges;
+    Graph->OrderIndex = DeviceOrderIndex;
 
-        PreviousOrderIndex = DeviceOrderIndex;
+    if (DeviceOrderIndex <= 0) {
+        IsAcyclic = TRUE;
     }
 
-    //
-    // Capture how many attempts were made to determine if the graph was
-    // acyclic.
-    //
-
-    Graph->CuIsAcyclicPhase1Attempts = Attempts;
-
     //
     // Make a note that we're acyclic if applicable in the graph's flags.
     // This is checked by GraphAssign() to ensure we only operate on acyclic

From e5f269670b6e4cbbd2224ed11639db539d70e03c Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Sun, 29 Mar 2026 21:01:10 -0700
Subject: [PATCH 02/14] Chm02 CUDA: fix no-file-io completion path

---
 src/PerfectHash/Chm02.c                       | 98 ++++++++++++++++++-
 src/PerfectHash/Graph.c                       | 65 ++++++++++--
 .../run_cli_chm02_cuda_known_seed_test.cmake  | 46 +++++++++
 3 files changed, 198 insertions(+), 11 deletions(-)
 create mode 100644 tests/run_cli_chm02_cuda_known_seed_test.cmake

diff --git a/src/PerfectHash/Chm02.c b/src/PerfectHash/Chm02.c
index 8525a58b..0fda6299 100644
--- a/src/PerfectHash/Chm02.c
+++ b/src/PerfectHash/Chm02.c
@@ -17,6 +17,19 @@ Module Name:
 #include "Chm01.h"
 #include "Chm02Private.h"
 
+FORCEINLINE
+BOOLEAN
+IsChm02CudaDebugEnabled(
+    VOID
+    )
+{
+#ifdef PH_WINDOWS
+    return (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0);
+#else
+    return (getenv("PH_DEBUG_CUDA_CHM02") != NULL);
+#endif
+}
+
 //
 // Main table creation implementation routine for Chm02.
 //
@@ -129,6 +142,7 @@ Return Value:
     PPERFECT_HASH_CONTEXT Context;
     BOOL WaitForAllEvents = TRUE;
     PERFECT_HASH_TABLE_CREATE_FLAGS TableCreateFlags;
+    BOOLEAN DebugCuda;
     PPH_CU_DEVICE_CONTEXT DeviceContext;
     PPH_CU_DEVICE_CONTEXTS DeviceContexts;
     PPH_CU_SOLVE_CONTEXTS SolveContexts;
@@ -182,6 +196,7 @@ Return Value:
 
     TableCreateFlags.AsULongLong = Table->TableCreateFlags.AsULongLong;
     Silent = (TableCreateFlags.Silent != FALSE);
+    DebugCuda = IsChm02CudaDebugEnabled();
 
     //
     // Initialize variables used if we jump to Error early-on.
@@ -517,6 +532,14 @@ Return Value:
     WaitForThreadpoolWorkCallbacks(Context->MainWork, TRUE);
     WaitForThreadpoolWorkCallbacks(Context->FinishedWork, FALSE);
 
+    if (DebugCuda) {
+        fprintf(stderr,
+                "[Chm02] Threadpool callbacks drained. FinishedCount=%lld "
+                "FirstSolvedWins=%u\n",
+                (long long)Context->FinishedCount,
+                (unsigned)FirstSolvedGraphWins(Context));
+    }
+
     Success = (Context->FinishedCount > 0);
 
     if (!Success && !CtrlCPressed) {
@@ -634,6 +657,14 @@ Return Value:
         ASSERT(ListEntry);
         Graph = CONTAINING_RECORD(ListEntry, GRAPH, ListEntry);
 
+        if (DebugCuda) {
+            fprintf(stderr,
+                    "[Chm02] Pulled winning graph from finished list: %p "
+                    "IsCuGraph=%u\n",
+                    Graph,
+                    (unsigned)IsCuGraph(Graph));
+        }
+
     } else {
 
         EnterCriticalSection(&Context->BestGraphCriticalSection);
@@ -644,6 +675,14 @@ Return Value:
         if (!Graph) {
             goto Error;
         }
+
+        if (DebugCuda) {
+            fprintf(stderr,
+                    "[Chm02] Selected best graph from context: %p "
+                    "IsCuGraph=%u\n",
+                    Graph,
+                    (unsigned)IsCuGraph(Graph));
+        }
     }
 
     //
@@ -677,6 +716,15 @@ Return Value:
 
     Context->SolvedContext = Graph;
 
+    if (DebugCuda) {
+        fprintf(stderr,
+                "[Chm02] Context->SolvedContext set. MaxDepth=%u "
+                "EmptyVertices=%u Collisions=%u\n",
+                (unsigned)Graph->MaximumTraversalDepth,
+                (unsigned)Graph->NumberOfEmptyVertices,
+                (unsigned)Graph->Collisions);
+    }
+
     //
     // Graphs always pass verification in normal circumstances.  The only time
     // they don't is if there's an internal bug in our code.  So, knowing that
@@ -695,6 +743,15 @@ Return Value:
 
     } else {
 
+        if (DebugCuda) {
+            fprintf(stderr,
+                    "[Chm02] NoFileIo path entered. IsCuGraph=%u "
+                    "AssignedElementSize=%u NumberOfTableElements=%llu\n",
+                    (unsigned)IsCuGraph(Graph),
+                    (unsigned)TableInfoOnDisk->AssignedElementSizeInBytes,
+                    (unsigned long long)TableInfoOnDisk->NumberOfTableElements.QuadPart);
+        }
+
         PGRAPH_INFO_ON_DISK NewGraphInfoOnDisk;
 
         //
@@ -727,6 +784,10 @@ Return Value:
                    GraphInfoOnDisk,
                    sizeof(*NewGraphInfoOnDisk));
 
+        if (DebugCuda) {
+            fprintf(stderr, "[Chm02] Copied GraphInfoOnDisk.\n");
+        }
+
         //
         // Sanity check the first seed is not 0.
         //
@@ -745,6 +806,13 @@ Return Value:
                    &Graph->FirstSeed,
                    Graph->NumberOfSeeds * sizeof(Graph->FirstSeed));
 
+        if (DebugCuda) {
+            fprintf(stderr,
+                    "[Chm02] Copied seeds. FirstSeed=%u NumberOfSeeds=%u\n",
+                    (unsigned)Graph->FirstSeed,
+                    (unsigned)Graph->NumberOfSeeds);
+        }
+
         //
         // Switch the pointers.
         //
@@ -783,7 +851,7 @@ Return Value:
 
             SizeInBytes = (
                 TableInfoOnDisk->NumberOfTableElements.QuadPart *
-                TableInfoOnDisk->KeySizeInBytes
+                TableInfoOnDisk->AssignedElementSizeInBytes
             );
 
             TryLargePageVirtualAlloc = Rtl->Vtbl->TryLargePageVirtualAlloc;
@@ -795,6 +863,15 @@ Return Value:
                                                    &LargePagesForTableData);
 
             Table->TableDataBaseAddress = BaseAddress;
+            Table->TableDataSizeInBytes = SizeInBytes;
+
+            if (DebugCuda) {
+                fprintf(stderr,
+                        "[Chm02] Allocated table data SizeInBytes=%lld "
+                        "BaseAddress=%p\n",
+                        (long long)SizeInBytes,
+                        BaseAddress);
+            }
 
             if (!BaseAddress) {
                 Result = E_OUTOFMEMORY;
@@ -821,6 +898,10 @@ Return Value:
                        Graph->Assigned,
                        SizeInBytes);
 
+            if (DebugCuda) {
+                fprintf(stderr, "[Chm02] Copied table data from Graph->Assigned.\n");
+            }
+
         }
 
     }
@@ -832,10 +913,18 @@ Return Value:
 
     CONTEXT_START_TIMERS(Verify);
 
+    if (DebugCuda) {
+        fprintf(stderr, "[Chm02] About to call Graph->Verify().\n");
+    }
+
     Result = Graph->Vtbl->Verify(Graph);
 
     CONTEXT_END_TIMERS(Verify);
 
+    if (DebugCuda) {
+        fprintf(stderr, "[Chm02] Verify result=0x%08x\n", (unsigned)Result);
+    }
+
     //
     // Set the verified table event (regardless of whether or not we succeeded
     // in verification).  The save file work will be waiting upon it in order to
@@ -848,12 +937,19 @@ Return Value:
         goto Error;
     }
 
+    if (DebugCuda) {
+        fprintf(stderr, "[Chm02] VerifiedTableEvent signaled.\n");
+    }
+
     if (FAILED(Result)) {
         Result = PH_E_TABLE_VERIFICATION_FAILED;
         goto Error;
     }
 
     if (NoFileIo(Table)) {
+        if (DebugCuda) {
+            fprintf(stderr, "[Chm02] NoFileIo exit after verify.\n");
+        }
         goto End;
     }
 
diff --git a/src/PerfectHash/Graph.c b/src/PerfectHash/Graph.c
index 9bb8d30b..bd35725b 100644
--- a/src/PerfectHash/Graph.c
+++ b/src/PerfectHash/Graph.c
@@ -3864,15 +3864,20 @@ Return Value:
         ASSERT(Context->NewBestGraphCount == 0);
         ASSERT(Context->FirstAttemptSolved == 0);
         SpareGraph = Context->SpareGraph;
-        ASSERT(SpareGraph != NULL);
-        ASSERT(IsSpareGraph(SpareGraph));
-        SpareGraph->Flags.IsSpare = FALSE;
-        Context->SpareGraph = NULL;
         Context->BestGraph = Graph;
         FoundBestGraph = TRUE;
         Coverage->BestGraphNumber = ++Context->NewBestGraphCount;
         Context->FirstAttemptSolved = Graph->Attempt;
-        Result = PH_S_USE_NEW_GRAPH_FOR_SOLVING;
+
+        if (SpareGraph != NULL) {
+            ASSERT(IsSpareGraph(SpareGraph));
+            SpareGraph->Flags.IsSpare = FALSE;
+            Context->SpareGraph = NULL;
+            Result = PH_S_USE_NEW_GRAPH_FOR_SOLVING;
+        } else {
+            StopGraphSolving = TRUE;
+            Result = PH_S_STOP_GRAPH_SOLVING;
+        }
     }
 
     //
@@ -6959,6 +6964,7 @@ Return Value:
     PPERFECT_HASH_CONTEXT Context;
     PASSIGNED16_MEMORY_COVERAGE Coverage;
     PERFECT_HASH_TABLE_BEST_COVERAGE_TYPE_ID CoverageType;
+    BOOLEAN DebugCudaSolve;
 
     //
     // Initialize aliases.
@@ -6969,6 +6975,22 @@ Return Value:
     CoverageType = Context->BestCoverageType;
     Attempt = Coverage->Attempt;
     ElapsedMilliseconds = GetTickCount64() - Context->StartMilliseconds;
+    DebugCudaSolve = FALSE;
+
+#ifdef PH_WINDOWS
+    DebugCudaSolve = (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0);
+#else
+    DebugCudaSolve = (getenv("PH_DEBUG_CUDA_CHM02") != NULL);
+#endif
+
+    if (DebugCudaSolve && IsCuGraph(Graph)) {
+        fprintf(stderr,
+                "[GraphRegisterSolved16NoBestCoverage] Enter "
+                "BestGraph=%p SpareGraph=%p SolutionNumber=%llu\n",
+                Context->BestGraph,
+                Context->SpareGraph,
+                (unsigned long long)Graph->SolutionNumber);
+    }
 
     //
     // Indicate continue graph solving by default.
@@ -7002,15 +7024,19 @@ Return Value:
         ASSERT(Context->NewBestGraphCount == 0);
         ASSERT(Context->FirstAttemptSolved == 0);
         SpareGraph = Context->SpareGraph;
-        ASSERT(SpareGraph != NULL);
-        ASSERT(IsSpareGraph(SpareGraph));
-        SpareGraph->Flags.IsSpare = FALSE;
-        Context->SpareGraph = NULL;
         Context->BestGraph = Graph;
         FoundBestGraph = TRUE;
         Coverage->BestGraphNumber = ++Context->NewBestGraphCount;
         Context->FirstAttemptSolved = Graph->Attempt;
-        Result = PH_S_USE_NEW_GRAPH_FOR_SOLVING;
+        if (SpareGraph != NULL) {
+            ASSERT(IsSpareGraph(SpareGraph));
+            SpareGraph->Flags.IsSpare = FALSE;
+            Context->SpareGraph = NULL;
+            Result = PH_S_USE_NEW_GRAPH_FOR_SOLVING;
+        } else {
+            StopGraphSolving = TRUE;
+            Result = PH_S_STOP_GRAPH_SOLVING;
+        }
     }
 
     //
@@ -7019,6 +7045,16 @@ Return Value:
 
     LeaveCriticalSection(&Context->BestGraphCriticalSection);
 
+    if (DebugCudaSolve && IsCuGraph(Graph)) {
+        fprintf(stderr,
+                "[GraphRegisterSolved16NoBestCoverage] AfterCS "
+                "FoundBestGraph=%u Result=0x%08x SpareGraph=%p BestGraph=%p\n",
+                (unsigned)FoundBestGraph,
+                (unsigned)Result,
+                SpareGraph,
+                Context->BestGraph);
+    }
+
 End:
 
     if (Context->TargetNumberOfSolutions > 0) {
@@ -7048,6 +7084,15 @@ Return Value:
         *NewGraphPointer = SpareGraph;
     }
 
+    if (DebugCudaSolve && IsCuGraph(Graph)) {
+        fprintf(stderr,
+                "[GraphRegisterSolved16NoBestCoverage] Exit Result=0x%08x "
+                "NewGraph=%p Stop=%u\n",
+                (unsigned)Result,
+                *NewGraphPointer,
+                (unsigned)StopGraphSolving);
+    }
+
     EVENT_WRITE_GRAPH_FOUND(Found);
 
     return Result;
diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake
new file mode 100644
index 00000000..2426b3f5
--- /dev/null
+++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake
@@ -0,0 +1,46 @@
+if(NOT DEFINED TEST_EXE)
+  message(FATAL_ERROR "TEST_EXE is required")
+endif()
+if(NOT DEFINED TEST_KEYS)
+  message(FATAL_ERROR "TEST_KEYS is required")
+endif()
+if(NOT DEFINED TEST_OUTPUT)
+  message(FATAL_ERROR "TEST_OUTPUT is required")
+endif()
+
+file(TO_NATIVE_PATH "${TEST_EXE}" test_exe_native)
+file(TO_NATIVE_PATH "${TEST_KEYS}" test_keys_native)
+file(TO_NATIVE_PATH "${TEST_OUTPUT}" test_output_native)
+
+file(MAKE_DIRECTORY "${test_output_native}")
+
+set(args
+  "Chm02"
+  "Mulshrolate3RX"
+  "And"
+  "1"
+  "--CuConcurrency=1"
+  "--FixedAttempts=2"
+  "--Seeds=0xF0192B55,0xD9C83970,0x0C1E0D10,0xD11A5847"
+  "--NoFileIo"
+  "--DisableCsvOutputFile"
+)
+
+execute_process(
+  COMMAND "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args}
+  RESULT_VARIABLE result
+  OUTPUT_VARIABLE stdout
+  ERROR_VARIABLE stderr
+)
+
+message(STATUS "stdout: ${stdout}")
+message(STATUS "stderr: ${stderr}")
+
+if(NOT result EQUAL 0)
+  message(FATAL_ERROR "Command failed with exit code ${result}")
+endif()
+
+string(FIND "${stderr}" "PerfectHashTableCreate failed" failure_index)
+if(NOT failure_index EQUAL -1)
+  message(FATAL_ERROR "Expected Chm02 CUDA known-seed run to succeed, but stderr reported failure.")
+endif()

From 1425eaa0ae970ee9c8e10279df695d026ea4c481 Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Sun, 29 Mar 2026 21:01:35 -0700
Subject: [PATCH 03/14] Chm02 CUDA: add GPU assignment and verify path

---
 src/PerfectHash/Cu.h                          |  12 ++
 src/PerfectHash/GraphCu.c                     |  17 ++-
 src/PerfectHashCuda/Graph.cu                  | 121 +++++++++++++++++-
 src/PerfectHashCuda/PerfectHashCuda.def       |   1 +
 .../run_cli_chm02_cuda_known_seed_test.cmake  |  15 ++-
 5 files changed, 157 insertions(+), 9 deletions(-)

diff --git a/src/PerfectHash/Cu.h b/src/PerfectHash/Cu.h
index f86dbaac..8b696653 100644
--- a/src/PerfectHash/Cu.h
+++ b/src/PerfectHash/Cu.h
@@ -2113,6 +2113,16 @@ HRESULT
     );
 typedef PERFECT_HASH_CUDA_ADD_KEYS *PPERFECT_HASH_CUDA_ADD_KEYS;
 
+typedef
+HRESULT
+(PERFECT_HASH_CUDA_ASSIGN)(
+    _Inout_ struct _GRAPH *Graph,
+    _In_ ULONG BlocksPerGrid,
+    _In_ ULONG ThreadsPerBlock,
+    _In_ ULONG SharedMemoryInBytes
+    );
+typedef PERFECT_HASH_CUDA_ASSIGN *PPERFECT_HASH_CUDA_ASSIGN;
+
 typedef
 HRESULT
 (PERFECT_HASH_CUDA_IS_GRAPH_ACYCLIC)(
@@ -2142,6 +2152,8 @@ typedef PERFECT_HASH_CUDA_COUNT_NONEMPTY
                                                                          \
     FIRST_ENTRY(ADD_KEYS, AddKeys)                                       \
                                                                          \
+    ENTRY(ASSIGN, Assign)                                                \
+                                                                         \
     LAST_ENTRY(IS_GRAPH_ACYCLIC, IsAcyclic)
 
 #define PERFECT_HASH_CUDA_FUNCTION_TABLE_ENTRY(ENTRY) \
diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c
index 4ec65ab2..c7cb8c0a 100644
--- a/src/PerfectHash/GraphCu.c
+++ b/src/PerfectHash/GraphCu.c
@@ -1583,7 +1583,7 @@ GraphCuAssign(
     _In_ PGRAPH Graph
     )
 {
-    PRTL Rtl;
+    PCU Cu;
     HRESULT Result;
 
     if (IsCudaDebugEnabled()) {
@@ -1594,17 +1594,22 @@ GraphCuAssign(
                 (long)Graph->CpuGraph->OrderIndex);
     }
 
-    Result = Graph->CpuGraph->Vtbl->Assign(Graph->CpuGraph);
+    Cu = Graph->CuSolveContext->DeviceContext->Cu;
+
+    Result = Cu->Assign(Graph,
+                        Graph->CuBlocksPerGrid,
+                        Graph->CuThreadsPerBlock,
+                        Graph->CuSharedMemory);
+
     if (IsCudaDebugEnabled()) {
-        fprintf(stderr, "[GraphCuAssign] CpuAssignResult=0x%08x\n", (unsigned)Result);
+        fprintf(stderr, "[GraphCuAssign] GpuAssignResult=0x%08x\n", (unsigned)Result);
     }
     if (FAILED(Result)) {
         return Result;
     }
 
-    Rtl = Graph->Context->Rtl;
-    CopyMemory(Graph->Assigned,
-               Graph->CpuGraph->Assigned,
+    CopyMemory(Graph->CpuGraph->Assigned,
+               Graph->Assigned,
                Graph->Info->AssignedSizeInBytes);
 
     return Result;
diff --git a/src/PerfectHashCuda/Graph.cu b/src/PerfectHashCuda/Graph.cu
index 9dfbe758..86211ea9 100644
--- a/src/PerfectHashCuda/Graph.cu
+++ b/src/PerfectHashCuda/Graph.cu
@@ -956,6 +956,87 @@ GraphCuIsAcyclicSerialKernel(
     }
 }
 
+template<typename GraphType>
+DEVICE
+bool
+GraphCuIsVisitedVertex(
+    _In_ GraphType *Graph,
+    _In_ typename GraphType::VertexType Vertex
+    )
+{
+    auto *Bitmap = (uint64_t *)Graph->VisitedVerticesBitmap.Buffer;
+    const uint64_t Mask = (1ull << ((uint64_t)Vertex & 63ull));
+    return ((Bitmap[(uint64_t)Vertex >> 6] & Mask) != 0);
+}
+
+template<typename GraphType>
+DEVICE
+void
+GraphCuRegisterVertexVisit(
+    _In_ GraphType *Graph,
+    _In_ typename GraphType::VertexType Vertex
+    )
+{
+    auto *Bitmap = (uint64_t *)Graph->VisitedVerticesBitmap.Buffer;
+    const uint64_t Mask = (1ull << ((uint64_t)Vertex & 63ull));
+    Bitmap[(uint64_t)Vertex >> 6] |= Mask;
+}
+
+template<typename GraphType>
+GLOBAL
+VOID
+GraphCuAssignSerialKernel(
+    _In_ GraphType *Graph
+    )
+{
+    using AssignedType = typename GraphType::AssignedType;
+    using Edge3Type = typename GraphType::Edge3Type;
+    using EdgeType = typename GraphType::EdgeType;
+    using OrderType = typename GraphType::OrderType;
+    using VertexType = typename GraphType::VertexType;
+
+    if (blockIdx.x != 0 || threadIdx.x != 0) {
+        return;
+    }
+
+    auto *Assigned = (AssignedType *)Graph->Assigned;
+    auto *Edges3 = (Edge3Type *)Graph->Edges3;
+    auto *Order = (OrderType *)Graph->Order;
+    const EdgeType NumberOfEdges = (EdgeType)Graph->NumberOfEdges;
+
+    ASSERT(Graph->Flags.IsAcyclic);
+
+    for (uint32_t Index = (uint32_t)Graph->OrderIndex;
+         Index < Graph->NumberOfKeys;
+         Index++)
+    {
+        const EdgeType Edge = (EdgeType)Order[Index];
+        const Edge3Type *Edge3 = &Edges3[Edge];
+        VertexType Vertex1;
+        VertexType Vertex2;
+        AssignedType Value;
+
+        if (!GraphCuIsVisitedVertex(Graph, Edge3->Vertex1)) {
+            Vertex1 = Edge3->Vertex1;
+            Vertex2 = Edge3->Vertex2;
+        } else {
+            Vertex1 = Edge3->Vertex2;
+            Vertex2 = Edge3->Vertex1;
+        }
+
+        Value = (AssignedType)(Edge - Assigned[Vertex2]);
+        if (Value >= NumberOfEdges) {
+            Value = (AssignedType)(Value + NumberOfEdges);
+        }
+
+        ASSERT(Assigned[Vertex1] == INITIAL_ASSIGNMENT_VALUE);
+        Assigned[Vertex1] = Value;
+
+        GraphCuRegisterVertexVisit(Graph, Vertex1);
+        GraphCuRegisterVertexVisit(Graph, Vertex2);
+    }
+}
+
 EXTERN_C
 HOST
 HRESULT
@@ -970,7 +1051,6 @@ GraphCuIsAcyclic(
     LONG DeviceOrderIndex = 0;
     ULONG DeviceDeletedEdges = 0;
 
-    HRESULT Result;
     PGRAPH DeviceGraph;
     CUstream_st* Stream;
     PPH_CU_SOLVE_CONTEXT SolveContext;
@@ -1065,8 +1145,45 @@ GraphCuIsAcyclic(
         }
     }
 
-End:
     return (IsAcyclic ? S_OK : PH_E_GRAPH_CYCLIC_FAILURE);
 }
 
+EXTERN_C
+HOST
+HRESULT
+GraphCuAssign(
+    _In_ PGRAPH Graph,
+    _In_ ULONG BlocksPerGrid,
+    _In_ ULONG ThreadsPerBlock,
+    _In_ ULONG SharedMemoryInBytes
+    )
+{
+    HRESULT Result = S_OK;
+    PGRAPH DeviceGraph;
+    CUstream_st *Stream;
+    PPH_CU_SOLVE_CONTEXT SolveContext;
+
+    (void)BlocksPerGrid;
+    (void)ThreadsPerBlock;
+    (void)SharedMemoryInBytes;
+
+    SolveContext = Graph->CuSolveContext;
+    DeviceGraph = SolveContext->DeviceGraph;
+    Stream = (CUstream_st *)SolveContext->Stream;
+
+    if (IsUsingAssigned16(Graph)) {
+        GraphCuAssignSerialKernel<GRAPH16><<<1, 1, 0, Stream>>>(
+            (PGRAPH16)DeviceGraph
+        );
+    } else {
+        GraphCuAssignSerialKernel<GRAPH32><<<1, 1, 0, Stream>>>(
+            (PGRAPH32)DeviceGraph
+        );
+    }
+
+    CUDA_CALL(cudaStreamSynchronize(Stream));
+
+    return Result;
+}
+
 // vim:set ts=8 sw=4 sts=4 tw=80 expandtab filetype=cuda formatoptions=croql   :
diff --git a/src/PerfectHashCuda/PerfectHashCuda.def b/src/PerfectHashCuda/PerfectHashCuda.def
index dd6b2c58..4b85ae06 100644
--- a/src/PerfectHashCuda/PerfectHashCuda.def
+++ b/src/PerfectHashCuda/PerfectHashCuda.def
@@ -2,3 +2,4 @@ LIBRARY PerfectHashCuda
 EXPORTS
     GraphCuIsAcyclic
     GraphCuAddKeys
+    GraphCuAssign
diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake
index 2426b3f5..ecda4c4d 100644
--- a/tests/run_cli_chm02_cuda_known_seed_test.cmake
+++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake
@@ -27,7 +27,8 @@ set(args
 )
 
 execute_process(
-  COMMAND "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args}
+  COMMAND ${CMAKE_COMMAND} -E env PH_DEBUG_CUDA_CHM02=1
+          "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args}
   RESULT_VARIABLE result
   OUTPUT_VARIABLE stdout
   ERROR_VARIABLE stderr
@@ -44,3 +45,15 @@ string(FIND "${stderr}" "PerfectHashTableCreate failed" failure_index)
 if(NOT failure_index EQUAL -1)
   message(FATAL_ERROR "Expected Chm02 CUDA known-seed run to succeed, but stderr reported failure.")
 endif()
+
+if(DEFINED REQUIRE_GPU_ASSIGN AND REQUIRE_GPU_ASSIGN)
+  string(FIND "${stderr}" "[GraphCuAssign] GpuAssignResult=0x00000000" gpu_assign_index)
+  if(gpu_assign_index EQUAL -1)
+    message(FATAL_ERROR "Expected GPU assignment success log, but it was not present.")
+  endif()
+
+  string(FIND "${stderr}" "[GraphCuAssign] CpuAssignResult=" cpu_assign_index)
+  if(NOT cpu_assign_index EQUAL -1)
+    message(FATAL_ERROR "Expected GPU assignment path without CPU assign fallback, but CPU assign log was present.")
+  endif()
+endif()

From 0d099766234e3a883956f5cbfbeeb206b06942f3 Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Sun, 29 Mar 2026 21:01:57 -0700
Subject: [PATCH 04/14] Chm02 CUDA: fix Linux compat file-work path

---
 src/PerfectHash/Chm01FileWork.c               |  7 ++-
 src/PerfectHash/Chm01FileWorkStub.c           |  4 +-
 src/PerfectHash/Chm02Compat.c                 |  2 +
 .../run_cli_chm02_cuda_known_seed_test.cmake  | 59 ++++++++++++++++---
 4 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/src/PerfectHash/Chm01FileWork.c b/src/PerfectHash/Chm01FileWork.c
index 3c57546a..7a3c9a2f 100644
--- a/src/PerfectHash/Chm01FileWork.c
+++ b/src/PerfectHash/Chm01FileWork.c
@@ -54,7 +54,6 @@ PERFECT_HASH_FILE_WORK_ITEM_CALLBACK FileWorkItemCallbackChm01;
 // Begin method implementations.
 //
 
-#ifdef PH_WINDOWS
 PERFECT_HASH_FILE_WORK_CALLBACK FileWorkCallbackChm01;
 
 _Use_decl_annotations_
@@ -88,13 +87,17 @@ Return Value:
 {
     PFILE_WORK_ITEM Item;
 
+    if (!ARGUMENT_PRESENT(ListEntry)) {
+        return;
+    }
+
     Item = CONTAINING_RECORD(ListEntry, FILE_WORK_ITEM, ListEntry);
+
     Item->Instance = Instance;
     Item->Context = Context;
 
     FileWorkItemCallbackChm01(Item);
 }
-#endif
 
 _Use_decl_annotations_
 VOID
diff --git a/src/PerfectHash/Chm01FileWorkStub.c b/src/PerfectHash/Chm01FileWorkStub.c
index e9ea6f81..b5ab2f5d 100644
--- a/src/PerfectHash/Chm01FileWorkStub.c
+++ b/src/PerfectHash/Chm01FileWorkStub.c
@@ -16,7 +16,6 @@ Module Name:
 
 #ifdef PH_ONLINE_ONLY
 
-#ifdef PH_WINDOWS
 PERFECT_HASH_FILE_WORK_ITEM_CALLBACK FileWorkItemCallbackChm01;
 
 PERFECT_HASH_FILE_WORK_CALLBACK FileWorkCallbackChm01;
@@ -41,7 +40,6 @@ FileWorkCallbackChm01(
 
     FileWorkItemCallbackChm01(Item);
 }
-#endif
 
 _Use_decl_annotations_
 VOID
@@ -60,4 +58,4 @@ FileWorkItemCallbackChm01(
 
 #endif // PH_ONLINE_ONLY
 
-// vim:set ts=8 sw=4 sts=4 tw=80 expandtab                                     :
\ No newline at end of file
+// vim:set ts=8 sw=4 sts=4 tw=80 expandtab                                     :
diff --git a/src/PerfectHash/Chm02Compat.c b/src/PerfectHash/Chm02Compat.c
index 97302eeb..67c61732 100644
--- a/src/PerfectHash/Chm02Compat.c
+++ b/src/PerfectHash/Chm02Compat.c
@@ -375,6 +375,7 @@ Return Value:
     ASSERT(!NoFileIo(Table));                         \
     ZeroStructInline(Verb##Name);                     \
     Verb##Name.FileWorkId = FileWork##Verb##Name##Id; \
+    Verb##Name.Context = Context;                     \
     ThreadpoolAddWork(FileWorkThreadpool,             \
                       FileWorkItemCallbackChm01,      \
                       &Verb##Name);
@@ -1010,6 +1011,7 @@ Return Value:
     ASSERT(!NoFileIo(Table));                           \
     ZeroStructInline(Verb##Name);                       \
     Verb##Name.FileWorkId = FileWork##Verb##Name##Id;   \
+    Verb##Name.Context = Context;                       \
     Verb##Name.EndOfFile = EndOfFile;                   \
     ThreadpoolAddWork(FileWorkThreadpool,               \
                       FileWorkItemCallbackChm01,        \
diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake
index ecda4c4d..ec60a144 100644
--- a/tests/run_cli_chm02_cuda_known_seed_test.cmake
+++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake
@@ -14,18 +14,59 @@ file(TO_NATIVE_PATH "${TEST_OUTPUT}" test_output_native)
 
 file(MAKE_DIRECTORY "${test_output_native}")
 
+set(test_algorithm "Chm02")
+if(DEFINED TEST_ALGORITHM)
+  set(test_algorithm "${TEST_ALGORITHM}")
+endif()
+
+set(test_hash "Mulshrolate3RX")
+if(DEFINED TEST_HASH)
+  set(test_hash "${TEST_HASH}")
+endif()
+
+set(test_mask "And")
+if(DEFINED TEST_MASK)
+  set(test_mask "${TEST_MASK}")
+endif()
+
+set(test_concurrency "1")
+if(DEFINED TEST_CONCURRENCY)
+  set(test_concurrency "${TEST_CONCURRENCY}")
+endif()
+
+set(test_cu_concurrency "--CuConcurrency=1")
+if(DEFINED TEST_CU_CONCURRENCY)
+  set(test_cu_concurrency "${TEST_CU_CONCURRENCY}")
+endif()
+
+set(test_fixed_attempts "--FixedAttempts=2")
+if(DEFINED TEST_FIXED_ATTEMPTS)
+  set(test_fixed_attempts "${TEST_FIXED_ATTEMPTS}")
+endif()
+
+set(test_seeds "--Seeds=0xF0192B55,0xD9C83970,0x0C1E0D10,0xD11A5847")
+if(DEFINED TEST_SEEDS)
+  set(test_seeds "${TEST_SEEDS}")
+endif()
+
 set(args
-  "Chm02"
-  "Mulshrolate3RX"
-  "And"
-  "1"
-  "--CuConcurrency=1"
-  "--FixedAttempts=2"
-  "--Seeds=0xF0192B55,0xD9C83970,0x0C1E0D10,0xD11A5847"
-  "--NoFileIo"
-  "--DisableCsvOutputFile"
+  "${test_algorithm}"
+  "${test_hash}"
+  "${test_mask}"
+  "${test_concurrency}"
+  "${test_cu_concurrency}"
+  "${test_fixed_attempts}"
+  "${test_seeds}"
 )
 
+if(DEFINED TEST_FLAGS)
+  foreach(flag IN LISTS TEST_FLAGS)
+    list(APPEND args "${flag}")
+  endforeach()
+else()
+  list(APPEND args "--NoFileIo" "--DisableCsvOutputFile")
+endif()
+
 execute_process(
   COMMAND ${CMAKE_COMMAND} -E env PH_DEBUG_CUDA_CHM02=1
           "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args}

From 9daa95787ebae9d7e2e96851d755d1a680cf7bd0 Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Sun, 29 Mar 2026 21:02:18 -0700
Subject: [PATCH 05/14] Chm02 CUDA: validate peel order against CPU oracle

---
 src/PerfectHash/GraphCu.c                     | 234 ++++++++++++++++++
 .../run_cli_chm02_cuda_known_seed_test.cmake  |   7 +
 2 files changed, 241 insertions(+)

diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c
index c7cb8c0a..ac038ec9 100644
--- a/src/PerfectHash/GraphCu.c
+++ b/src/PerfectHash/GraphCu.c
@@ -1330,6 +1330,197 @@ IsCudaDebugEnabled(
 #endif
 }
 
+#define DEFINE_VALIDATE_GPU_ORDER(Name,                               \
+                                  OrderType,                          \
+                                  EdgeType,                           \
+                                  Vertex3Type,                        \
+                                  Edge3Type,                          \
+                                  OrderField,                         \
+                                  VerticesField,                      \
+                                  EdgesField)                         \
+static                                                           \
+HRESULT                                                          \
+Name(                                                            \
+    _In_ PGRAPH Graph,                                           \
+    _Out_opt_ PULONG InvalidIndexPointer,                        \
+    _Out_opt_ PLONG InvalidEdgePointer,                          \
+    _Out_opt_ PULONG Degree1Pointer,                             \
+    _Out_opt_ PULONG Degree2Pointer,                             \
+    _Out_opt_ PCSTR *ReasonPointer                               \
+    )                                                            \
+{                                                                \
+    LONG SignedEdge;                                             \
+    ULONG Index;                                                 \
+    ULONG NumberOfKeys;                                          \
+    ULONG NumberOfVertices;                                      \
+    PGRAPH CpuGraph;                                             \
+    PBYTE Seen = NULL;                                           \
+    HRESULT Result = S_OK;                                       \
+    OrderType *Order;                                            \
+    EdgeType Edge;                                               \
+    Edge3Type *Edge3 = NULL;                                     \
+    Edge3Type *Edges;                                            \
+    Vertex3Type *Vertex1 = NULL;                                 \
+    Vertex3Type *Vertex2 = NULL;                                 \
+    Vertex3Type *ScratchVertices = NULL;                         \
+                                                                 \
+    CpuGraph = Graph->CpuGraph;                                  \
+    NumberOfKeys = CpuGraph->NumberOfKeys;                       \
+    NumberOfVertices = CpuGraph->NumberOfVertices;               \
+    Order = (OrderType *)Graph->OrderField;                      \
+    Edges = (Edge3Type *)CpuGraph->EdgesField;                   \
+                                                                 \
+    if (InvalidIndexPointer) {                                   \
+        *InvalidIndexPointer = (ULONG)-1;                        \
+    }                                                            \
+    if (InvalidEdgePointer) {                                    \
+        *InvalidEdgePointer = -1;                                \
+    }                                                            \
+    if (Degree1Pointer) {                                        \
+        *Degree1Pointer = 0;                                     \
+    }                                                            \
+    if (Degree2Pointer) {                                        \
+        *Degree2Pointer = 0;                                     \
+    }                                                            \
+    if (ReasonPointer) {                                         \
+        *ReasonPointer = "unknown";                              \
+    }                                                            \
+                                                                 \
+    ScratchVertices = (Vertex3Type *)(                           \
+        calloc(NumberOfVertices, sizeof(*ScratchVertices))        \
+    );                                                           \
+    Seen = (PBYTE)calloc(NumberOfKeys, sizeof(*Seen));           \
+                                                                 \
+    if (!ScratchVertices || !Seen) {                             \
+        Result = E_OUTOFMEMORY;                                  \
+        if (ReasonPointer) {                                     \
+            *ReasonPointer = "oom";                              \
+        }                                                        \
+        goto End;                                                \
+    }                                                            \
+                                                                 \
+    CopyMemory(ScratchVertices,                                  \
+               CpuGraph->VerticesField,                          \
+               NumberOfVertices * sizeof(*ScratchVertices));      \
+                                                                 \
+    for (Index = NumberOfKeys; Index > 0; Index--) {             \
+        ULONG OrderIndex;                                        \
+                                                                 \
+        OrderIndex = Index - 1;                                  \
+                                                                 \
+        SignedEdge = (LONG)Order[OrderIndex];                    \
+                                                                 \
+        if (SignedEdge < 0 || (ULONG)SignedEdge >= NumberOfKeys) { \
+            Result = PH_E_INVARIANT_CHECK_FAILED;                \
+            if (ReasonPointer) {                                 \
+                *ReasonPointer = "out_of_range";                 \
+            }                                                    \
+            goto Invalid;                                        \
+        }                                                        \
+                                                                 \
+        Edge = (EdgeType)SignedEdge;                             \
+                                                                 \
+        if (Seen[Edge]) {                                        \
+            Result = PH_E_INVARIANT_CHECK_FAILED;                \
+            if (ReasonPointer) {                                 \
+                *ReasonPointer = "duplicate_edge";               \
+            }                                                    \
+            goto Invalid;                                        \
+        }                                                        \
+                                                                 \
+        Seen[Edge] = TRUE;                                       \
+        Edge3 = &Edges[Edge];                                    \
+        Vertex1 = &ScratchVertices[Edge3->Vertex1];              \
+        Vertex2 = &ScratchVertices[Edge3->Vertex2];              \
+                                                                 \
+        if (Vertex1->Degree == 0 || Vertex2->Degree == 0) {      \
+            Result = PH_E_INVARIANT_CHECK_FAILED;                \
+            if (ReasonPointer) {                                 \
+                *ReasonPointer = "edge_already_removed";         \
+            }                                                    \
+            goto Invalid;                                        \
+        }                                                        \
+                                                                 \
+        if (Vertex1->Degree != 1 && Vertex2->Degree != 1) {      \
+            Result = PH_E_INVARIANT_CHECK_FAILED;                \
+            if (ReasonPointer) {                                 \
+                *ReasonPointer = "no_degree1_endpoint";          \
+            }                                                    \
+            goto Invalid;                                        \
+        }                                                        \
+                                                                 \
+        Vertex1->Degree -= 1;                                    \
+        Vertex1->Edges ^= Edge;                                  \
+        Vertex2->Degree -= 1;                                    \
+        Vertex2->Edges ^= Edge;                                  \
+    }                                                            \
+                                                                 \
+    for (Index = 0; Index < NumberOfVertices; Index++) {         \
+        if (ScratchVertices[Index].Degree != 0) {                \
+            Result = PH_E_INVARIANT_CHECK_FAILED;                \
+            if (ReasonPointer) {                                 \
+                *ReasonPointer = "residual_degree";              \
+            }                                                    \
+            if (InvalidIndexPointer) {                           \
+                *InvalidIndexPointer = Index;                    \
+            }                                                    \
+            if (Degree1Pointer) {                                \
+                *Degree1Pointer = ScratchVertices[Index].Degree; \
+            }                                                    \
+            if (InvalidEdgePointer) {                            \
+                *InvalidEdgePointer = -1;                        \
+            }                                                    \
+            goto End;                                            \
+        }                                                        \
+    }                                                            \
+                                                                 \
+    if (ReasonPointer) {                                         \
+        *ReasonPointer = "ok";                                   \
+    }                                                            \
+    goto End;                                                    \
+                                                                 \
+Invalid:                                                         \
+    if (InvalidIndexPointer) {                                   \
+        *InvalidIndexPointer = (Index > 0 ? Index - 1 : 0);      \
+    }                                                            \
+    if (InvalidEdgePointer) {                                    \
+        *InvalidEdgePointer = SignedEdge;                        \
+    }                                                            \
+    if (Degree1Pointer) {                                        \
+        *Degree1Pointer = Vertex1 ? (ULONG)Vertex1->Degree : 0;  \
+    }                                                            \
+    if (Degree2Pointer) {                                        \
+        *Degree2Pointer = Vertex2 ? (ULONG)Vertex2->Degree : 0;  \
+    }                                                            \
+                                                                 \
+End:                                                             \
+    if (Seen) {                                                  \
+        free(Seen);                                              \
+    }                                                            \
+    if (ScratchVertices) {                                       \
+        free(ScratchVertices);                                   \
+    }                                                            \
+    return Result;                                               \
+}
+
+DEFINE_VALIDATE_GPU_ORDER(ValidateGpuOrder16,
+                          ORDER16,
+                          EDGE16,
+                          VERTEX163,
+                          EDGE163,
+                          Order16,
+                          Vertices163,
+                          Edges163);
+
+DEFINE_VALIDATE_GPU_ORDER(ValidateGpuOrder32,
+                          ORDER,
+                          EDGE,
+                          VERTEX3,
+                          EDGE3,
+                          Order,
+                          Vertices3,
+                          Edges3);
+
 HRESULT
 GraphCuAddKeys(
     _In_ PGRAPH Graph,
@@ -1487,9 +1678,14 @@ GraphCuIsAcyclic(
 
     if (IsCudaDebugEnabled()) {
         HRESULT CpuAcyclicResult;
+        HRESULT GpuOrderValidationResult;
+        LONG InvalidEdge;
+        ULONG Degree1;
+        ULONG Degree2;
         ULONG MismatchIndex;
         ULONG CpuOrder;
         ULONG GpuOrder;
+        PCSTR Reason;
 
         CpuAcyclicResult = Graph->CpuGraph->Vtbl->IsAcyclic(Graph->CpuGraph);
         fprintf(stderr,
@@ -1548,6 +1744,44 @@ GraphCuIsAcyclic(
         if (FAILED(Result)) {
             return Result;
         }
+
+        MismatchIndex = (ULONG)-1;
+        InvalidEdge = -1;
+        Degree1 = 0;
+        Degree2 = 0;
+        Reason = "unknown";
+
+        if (IsUsingAssigned16(Graph)) {
+            GpuOrderValidationResult = ValidateGpuOrder16(Graph,
+                                                          &MismatchIndex,
+                                                          &InvalidEdge,
+                                                          &Degree1,
+                                                          &Degree2,
+                                                          &Reason);
+        } else {
+            GpuOrderValidationResult = ValidateGpuOrder32(Graph,
+                                                          &MismatchIndex,
+                                                          &InvalidEdge,
+                                                          &Degree1,
+                                                          &Degree2,
+                                                          &Reason);
+        }
+
+        if (SUCCEEDED(GpuOrderValidationResult)) {
+            fprintf(stderr,
+                    "[GraphCuIsAcyclic] GpuOrderValidationResult=0x%08x\n",
+                    (unsigned)GpuOrderValidationResult);
+        } else {
+            fprintf(stderr,
+                    "[GraphCuIsAcyclic] GpuOrderValidationResult=0x%08x "
+                    "reason=%s index=%u edge=%ld degree1=%u degree2=%u\n",
+                    (unsigned)GpuOrderValidationResult,
+                    Reason,
+                    (unsigned)MismatchIndex,
+                    (long)InvalidEdge,
+                    (unsigned)Degree1,
+                    (unsigned)Degree2);
+        }
     }
 
     //
diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake
index ec60a144..347319a2 100644
--- a/tests/run_cli_chm02_cuda_known_seed_test.cmake
+++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake
@@ -98,3 +98,10 @@ if(DEFINED REQUIRE_GPU_ASSIGN AND REQUIRE_GPU_ASSIGN)
     message(FATAL_ERROR "Expected GPU assignment path without CPU assign fallback, but CPU assign log was present.")
   endif()
 endif()
+
+if(DEFINED REQUIRE_GPU_ORDER_VALID AND REQUIRE_GPU_ORDER_VALID)
+  string(FIND "${stderr}" "[GraphCuIsAcyclic] GpuOrderValidationResult=0x00000000" gpu_order_valid_index)
+  if(gpu_order_valid_index EQUAL -1)
+    message(FATAL_ERROR "Expected GPU order validation success log, but it was not present.")
+  endif()
+endif()

From 3613919efcb54e0f678ad4cad287f179b4d3f77b Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Sun, 29 Mar 2026 21:03:00 -0700
Subject: [PATCH 06/14] Chm02 CUDA: add peel-order validation and GPU verify

---
 src/PerfectHash/Cu.h                          |  12 ++
 src/PerfectHash/GraphCu.c                     |  17 ++-
 src/PerfectHashCuda/Graph.cu                  | 126 ++++++++++++++++++
 src/PerfectHashCuda/PerfectHashCuda.def       |   1 +
 .../run_cli_chm02_cuda_known_seed_test.cmake  |  12 ++
 5 files changed, 167 insertions(+), 1 deletion(-)

diff --git a/src/PerfectHash/Cu.h b/src/PerfectHash/Cu.h
index 8b696653..f25fb1d1 100644
--- a/src/PerfectHash/Cu.h
+++ b/src/PerfectHash/Cu.h
@@ -2123,6 +2123,16 @@ HRESULT
     );
 typedef PERFECT_HASH_CUDA_ASSIGN *PPERFECT_HASH_CUDA_ASSIGN;
 
+typedef
+HRESULT
+(PERFECT_HASH_CUDA_VERIFY)(
+    _Inout_ struct _GRAPH *Graph,
+    _In_ ULONG BlocksPerGrid,
+    _In_ ULONG ThreadsPerBlock,
+    _In_ ULONG SharedMemoryInBytes
+    );
+typedef PERFECT_HASH_CUDA_VERIFY *PPERFECT_HASH_CUDA_VERIFY;
+
 typedef
 HRESULT
 (PERFECT_HASH_CUDA_IS_GRAPH_ACYCLIC)(
@@ -2154,6 +2164,8 @@ typedef PERFECT_HASH_CUDA_COUNT_NONEMPTY
                                                                          \
     ENTRY(ASSIGN, Assign)                                                \
                                                                          \
+    ENTRY(VERIFY, Verify)                                                \
+                                                                         \
     LAST_ENTRY(IS_GRAPH_ACYCLIC, IsAcyclic)
 
 #define PERFECT_HASH_CUDA_FUNCTION_TABLE_ENTRY(ENTRY) \
diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c
index ac038ec9..888b038b 100644
--- a/src/PerfectHash/GraphCu.c
+++ b/src/PerfectHash/GraphCu.c
@@ -1854,9 +1854,24 @@ GraphCuVerify(
     _In_ PGRAPH Graph
     )
 {
+    PCU Cu;
     HRESULT Result;
 
-    Result = Graph->CpuGraph->Vtbl->Verify(Graph->CpuGraph);
+    if (IsCudaDebugEnabled()) {
+        fprintf(stderr, "[GraphCuVerify] Enter\n");
+    }
+
+    Cu = Graph->CuSolveContext->DeviceContext->Cu;
+
+    Result = Cu->Verify(Graph,
+                        Graph->CuBlocksPerGrid,
+                        Graph->CuThreadsPerBlock,
+                        Graph->CuSharedMemory);
+
+    if (IsCudaDebugEnabled()) {
+        fprintf(stderr, "[GraphCuVerify] GpuVerifyResult=0x%08x\n", (unsigned)Result);
+    }
+
     return Result;
 }
 
diff --git a/src/PerfectHashCuda/Graph.cu b/src/PerfectHashCuda/Graph.cu
index 86211ea9..fbbc7504 100644
--- a/src/PerfectHashCuda/Graph.cu
+++ b/src/PerfectHashCuda/Graph.cu
@@ -1037,6 +1037,56 @@ GraphCuAssignSerialKernel(
     }
 }
 
+template<typename GraphType>
+GLOBAL
+VOID
+GraphCuVerifyKernel(
+    _In_ GraphType *Graph,
+    _Inout_ uint32_t *Failures
+    )
+{
+    using AssignedType = typename GraphType::AssignedType;
+    using IndexType = typename GraphType::IndexType;
+    using KeyType = typename GraphType::KeyType;
+    using VertexPairType = typename GraphType::VertexPairType;
+
+    uint32_t Index;
+    KeyType Key;
+    KeyType *Keys;
+    VertexPairType Hash;
+    const uint32_t NumberOfKeys = Graph->NumberOfKeys;
+    const int32_t Stride = gridDim.x * blockDim.x;
+    const typename GraphType::HashVertexType HashMask =
+        (typename GraphType::HashVertexType)(Graph->NumberOfVertices - 1);
+    const IndexType IndexMask = (IndexType)(Graph->NumberOfEdges - 1);
+    auto HashFunction = GraphGetHashFunction(Graph);
+    auto *Assigned = (AssignedType *)Graph->Assigned;
+
+    Keys = (KeyType *)Graph->DeviceKeys;
+    Index = GlobalThreadIndex();
+
+    while (Index < NumberOfKeys) {
+        Key = Keys[Index];
+        Hash = HashFunction(Key, HashMask);
+
+        if (Hash.Vertex1 == Hash.Vertex2) {
+            atomicAdd(Failures, 1u);
+            Index += Stride;
+            continue;
+        }
+
+        IndexType Result = (IndexType)(
+            (Assigned[Hash.Vertex1] + Assigned[Hash.Vertex2]) & IndexMask
+        );
+
+        if ((uint32_t)Result != Index) {
+            atomicAdd(Failures, 1u);
+        }
+
+        Index += Stride;
+    }
+}
+
 EXTERN_C
 HOST
 HRESULT
@@ -1186,4 +1236,80 @@ GraphCuAssign(
     return Result;
 }
 
+EXTERN_C
+HOST
+HRESULT
+GraphCuVerify(
+    _In_ PGRAPH Graph,
+    _In_ ULONG BlocksPerGrid,
+    _In_ ULONG ThreadsPerBlock,
+    _In_ ULONG SharedMemoryInBytes
+    )
+{
+    HRESULT Result = S_OK;
+    uint32_t Failures = 0;
+    uint32_t *DeviceFailures = nullptr;
+    PGRAPH DeviceGraph;
+    CUstream_st *Stream;
+    PPH_CU_SOLVE_CONTEXT SolveContext;
+    ULONG SharedMemory = SharedMemoryInBytes;
+
+    SolveContext = Graph->CuSolveContext;
+    DeviceGraph = SolveContext->DeviceGraph;
+    Stream = (CUstream_st *)SolveContext->Stream;
+
+    if (IsUsingAssigned16(Graph)) {
+        Result = GetKernelConfig(Graph,
+                                 (PVOID)GraphCuVerifyKernel<GRAPH16>,
+                                 BlocksPerGrid,
+                                 ThreadsPerBlock,
+                                 SharedMemory);
+    } else {
+        Result = GetKernelConfig(Graph,
+                                 (PVOID)GraphCuVerifyKernel<GRAPH32>,
+                                 BlocksPerGrid,
+                                 ThreadsPerBlock,
+                                 SharedMemory);
+    }
+
+    if (FAILED(Result)) {
+        goto End;
+    }
+
+    CUDA_CALL(cudaMalloc((void **)&DeviceFailures, sizeof(*DeviceFailures)));
+    CUDA_CALL(cudaMemsetAsync(DeviceFailures, 0, sizeof(*DeviceFailures), Stream));
+
+    if (IsUsingAssigned16(Graph)) {
+        GraphCuVerifyKernel<GRAPH16><<<BlocksPerGrid,
+                                       ThreadsPerBlock,
+                                       SharedMemory,
+                                       Stream>>>((PGRAPH16)DeviceGraph,
+                                                 DeviceFailures);
+    } else {
+        GraphCuVerifyKernel<GRAPH32><<<BlocksPerGrid,
+                                       ThreadsPerBlock,
+                                       SharedMemory,
+                                       Stream>>>((PGRAPH32)DeviceGraph,
+                                                 DeviceFailures);
+    }
+
+    CUDA_CALL(cudaMemcpyAsync(&Failures,
+                              DeviceFailures,
+                              sizeof(Failures),
+                              cudaMemcpyDeviceToHost,
+                              Stream));
+    CUDA_CALL(cudaStreamSynchronize(Stream));
+
+    if (Failures > 0) {
+        Result = PH_E_TABLE_VERIFICATION_FAILED;
+    }
+
+End:
+    if (DeviceFailures) {
+        cudaFree(DeviceFailures);
+    }
+
+    return Result;
+}
+
 // vim:set ts=8 sw=4 sts=4 tw=80 expandtab filetype=cuda formatoptions=croql   :
diff --git a/src/PerfectHashCuda/PerfectHashCuda.def b/src/PerfectHashCuda/PerfectHashCuda.def
index 4b85ae06..0a5eaade 100644
--- a/src/PerfectHashCuda/PerfectHashCuda.def
+++ b/src/PerfectHashCuda/PerfectHashCuda.def
@@ -3,3 +3,4 @@ EXPORTS
     GraphCuIsAcyclic
     GraphCuAddKeys
     GraphCuAssign
+    GraphCuVerify
diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake
index 347319a2..778f5b31 100644
--- a/tests/run_cli_chm02_cuda_known_seed_test.cmake
+++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake
@@ -105,3 +105,15 @@ if(DEFINED REQUIRE_GPU_ORDER_VALID AND REQUIRE_GPU_ORDER_VALID)
     message(FATAL_ERROR "Expected GPU order validation success log, but it was not present.")
   endif()
 endif()
+
+if(DEFINED REQUIRE_GPU_VERIFY AND REQUIRE_GPU_VERIFY)
+  string(FIND "${stderr}" "[GraphCuVerify] GpuVerifyResult=0x00000000" gpu_verify_index)
+  if(gpu_verify_index EQUAL -1)
+    message(FATAL_ERROR "Expected GPU verify success log, but it was not present.")
+  endif()
+
+  string(FIND "${stderr}" "[GraphCuVerify] CpuVerifyResult=" cpu_verify_index)
+  if(NOT cpu_verify_index EQUAL -1)
+    message(FATAL_ERROR "Expected GPU verify path without CPU verify fallback, but CPU verify log was present.")
+  endif()
+endif()

From c627a5695bc09f4ec4c9d6e384c7e3037398bf1f Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Sun, 29 Mar 2026 21:03:23 -0700
Subject: [PATCH 07/14] Chm02 CUDA: add first-class regression tests

---
 tests/CMakeLists.txt | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 96e7bcb7..eaa6687f 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -265,6 +265,40 @@ if(TARGET PerfectHashCreateExe AND TARGET PerfectHashBulkCreateExe)
         PROPERTIES
             LABELS fast
     )
+
+    if(PERFECTHASH_HAS_CUDA)
+        add_test(
+            NAME perfecthash.cuda.chm02.hologram.nofileio
+            COMMAND ${CMAKE_COMMAND}
+                -DTEST_EXE=$<TARGET_FILE:PerfectHashCreateExe>
+                -DTEST_KEYS=${TEST_KEYS_FILE}
+                -DTEST_OUTPUT=${TEST_OUTPUT_DIR}/cuda-chm02-hologram-nofileio
+                "-DTEST_FLAGS=--NoFileIo;--DisableCsvOutputFile"
+                -DREQUIRE_GPU_ASSIGN=1
+                -DREQUIRE_GPU_ORDER_VALID=1
+                -DREQUIRE_GPU_VERIFY=1
+                -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_known_seed_test.cmake
+        )
+
+        add_test(
+            NAME perfecthash.cuda.chm02.hologram.fileio
+            COMMAND ${CMAKE_COMMAND}
+                -DTEST_EXE=$<TARGET_FILE:PerfectHashCreateExe>
+                -DTEST_KEYS=${TEST_KEYS_FILE}
+                -DTEST_OUTPUT=${TEST_OUTPUT_DIR}/cuda-chm02-hologram-fileio
+                -DREQUIRE_GPU_ASSIGN=1
+                -DREQUIRE_GPU_ORDER_VALID=1
+                -DREQUIRE_GPU_VERIFY=1
+                -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_known_seed_test.cmake
+        )
+
+        set_tests_properties(
+            perfecthash.cuda.chm02.hologram.nofileio
+            perfecthash.cuda.chm02.hologram.fileio
+            PROPERTIES
+                LABELS "cuda"
+        )
+    endif()
 else()
     message(STATUS "Skipping PerfectHash CLI tests for profile '${PERFECTHASH_BUILD_PROFILE_NORMALIZED}' (CLI executable targets are not available).")
 endif()

From dd7215152179beaffa25824c6aedeeff312fe1d6 Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Sun, 29 Mar 2026 21:03:44 -0700
Subject: [PATCH 08/14] Chm02 CUDA: add generated non-Assigned16 regression

---
 tests/CMakeLists.txt                          | 10 +++
 ..._cli_chm02_cuda_generated_33000_test.cmake | 67 +++++++++++++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 tests/run_cli_chm02_cuda_generated_33000_test.cmake

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index eaa6687f..9c95ab2c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -292,9 +292,19 @@ if(TARGET PerfectHashCreateExe AND TARGET PerfectHashBulkCreateExe)
                 -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_known_seed_test.cmake
         )
 
+        add_test(
+            NAME perfecthash.cuda.chm02.generated33000.nofileio
+            COMMAND ${CMAKE_COMMAND}
+                -DTEST_EXE=$<TARGET_FILE:PerfectHashCreateExe>
+                -DTEST_OUTPUT=${TEST_OUTPUT_DIR}/cuda-chm02-generated33000-nofileio
+                -DTEST_PYTHON=${Python3_EXECUTABLE}
+                -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_generated_33000_test.cmake
+        )
+
         set_tests_properties(
             perfecthash.cuda.chm02.hologram.nofileio
             perfecthash.cuda.chm02.hologram.fileio
+            perfecthash.cuda.chm02.generated33000.nofileio
             PROPERTIES
                 LABELS "cuda"
         )
diff --git a/tests/run_cli_chm02_cuda_generated_33000_test.cmake b/tests/run_cli_chm02_cuda_generated_33000_test.cmake
new file mode 100644
index 00000000..17f53e40
--- /dev/null
+++ b/tests/run_cli_chm02_cuda_generated_33000_test.cmake
@@ -0,0 +1,67 @@
+if(NOT DEFINED TEST_EXE)
+  message(FATAL_ERROR "TEST_EXE is required")
+endif()
+if(NOT DEFINED TEST_OUTPUT)
+  message(FATAL_ERROR "TEST_OUTPUT is required")
+endif()
+if(NOT DEFINED TEST_PYTHON)
+  message(FATAL_ERROR "TEST_PYTHON is required")
+endif()
+
+set(test_output "${TEST_OUTPUT}")
+file(MAKE_DIRECTORY "${test_output}")
+
+set(generator_script "${test_output}/generate_cuda_33000_keys.py")
+file(WRITE "${generator_script}" "import struct\n")
+file(APPEND "${generator_script}" "from pathlib import Path\n")
+file(APPEND "${generator_script}" "out = Path(r'''${test_output}''')\n")
+file(APPEND "${generator_script}" "out.mkdir(parents=True, exist_ok=True)\n")
+file(APPEND "${generator_script}" "count = 33000\n")
+file(APPEND "${generator_script}" "salt = 0x13579BDF\n")
+file(APPEND "${generator_script}" "multiplier = 2654435761\n")
+file(APPEND "${generator_script}" "keys = []\n")
+file(APPEND "${generator_script}" "i = 1\n")
+file(APPEND "${generator_script}" "while len(keys) < count:\n")
+file(APPEND "${generator_script}" "    value = ((i * multiplier) ^ salt) & 0xffffffff\n")
+file(APPEND "${generator_script}" "    if value != 0 and value != 0xffffffff:\n")
+file(APPEND "${generator_script}" "        keys.append(value)\n")
+file(APPEND "${generator_script}" "    i += 1\n")
+file(APPEND "${generator_script}" "keys.sort()\n")
+file(APPEND "${generator_script}" "(out / 'generated-33000.keys').write_bytes(b''.join(struct.pack('<I', v) for v in keys))\n")
+
+execute_process(
+  COMMAND "${TEST_PYTHON}" "${generator_script}"
+  RESULT_VARIABLE generate_result
+  OUTPUT_VARIABLE generate_stdout
+  ERROR_VARIABLE generate_stderr
+)
+
+if(NOT generate_result EQUAL 0)
+  message(STATUS "stdout: ${generate_stdout}")
+  message(STATUS "stderr: ${generate_stderr}")
+  message(FATAL_ERROR "Failed to generate 33000-key fixture (exit ${generate_result})")
+endif()
+
+set(generated_keys "${test_output}/generated-33000.keys")
+set(run_output "${test_output}/run")
+
+execute_process(
+  COMMAND ${CMAKE_COMMAND}
+    -DTEST_EXE=${TEST_EXE}
+    -DTEST_KEYS=${generated_keys}
+    -DTEST_OUTPUT=${run_output}
+    -DTEST_SEEDS=--Seeds=0x4FB8349A,0x64E0E8DD,0x1700010F,0xF7F80AD9
+    -DREQUIRE_GPU_ASSIGN=1
+    -DREQUIRE_GPU_ORDER_VALID=1
+    -DREQUIRE_GPU_VERIFY=1
+    -P ${CMAKE_CURRENT_LIST_DIR}/run_cli_chm02_cuda_known_seed_test.cmake
+  RESULT_VARIABLE run_result
+  OUTPUT_VARIABLE run_stdout
+  ERROR_VARIABLE run_stderr
+)
+
+if(NOT run_result EQUAL 0)
+  message(STATUS "stdout: ${run_stdout}")
+  message(STATUS "stderr: ${run_stderr}")
+  message(FATAL_ERROR "Generated 33000-key CUDA regression failed (exit ${run_result})")
+endif()

From 03f0b1030ecef8b3d4e79d5b466e739f9f02f4a4 Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Sun, 29 Mar 2026 21:04:06 -0700
Subject: [PATCH 09/14] Chm02 CUDA: add explicit timing surface

---
 src/PerfectHash/BulkCreateCsv.h               | 16 +++++
 src/PerfectHash/Chm02.c                       | 11 +++
 src/PerfectHash/Chm02Compat.c                 | 11 +++
 src/PerfectHash/Graph.h                       | 10 +++
 src/PerfectHash/GraphCu.c                     | 61 ++++++++++++++++
 src/PerfectHash/PerfectHashCompat.c           |  2 +-
 src/PerfectHash/PerfectHashTable.h            |  5 ++
 src/PerfectHash/TableCreateCsv.h              | 16 +++++
 tests/CMakeLists.txt                          | 11 +++
 tests/run_cli_chm02_cuda_perf_benchmark.cmake | 72 +++++++++++++++++++
 10 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 tests/run_cli_chm02_cuda_perf_benchmark.cmake

diff --git a/src/PerfectHash/BulkCreateCsv.h b/src/PerfectHash/BulkCreateCsv.h
index 3dd73c83..dbf876e9 100644
--- a/src/PerfectHash/BulkCreateCsv.h
+++ b/src/PerfectHash/BulkCreateCsv.h
@@ -424,6 +424,22 @@ Module Name:
           Context->VerifyElapsedMicroseconds.QuadPart,                                       \
           OUTPUT_INT)                                                                        \
                                                                                              \
+    ENTRY(CuAddKeysMicroseconds,                                                             \
+          Table->CuAddKeysElapsedMicroseconds.QuadPart,                                      \
+          OUTPUT_INT)                                                                        \
+                                                                                             \
+    ENTRY(CuIsAcyclicMicroseconds,                                                           \
+          Table->CuIsAcyclicElapsedMicroseconds.QuadPart,                                    \
+          OUTPUT_INT)                                                                        \
+                                                                                             \
+    ENTRY(CuAssignMicroseconds,                                                              \
+          Table->CuAssignElapsedMicroseconds.QuadPart,                                       \
+          OUTPUT_INT)                                                                        \
+                                                                                             \
+    ENTRY(CuVerifyMicroseconds,                                                              \
+          Table->CuVerifyElapsedMicroseconds.QuadPart,                                       \
+          OUTPUT_INT)                                                                        \
+                                                                                             \
     ENTRY(BenchmarkWarmups,                                                                  \
           Table->BenchmarkWarmups,                                                           \
           OUTPUT_INT)                                                                        \
diff --git a/src/PerfectHash/Chm02.c b/src/PerfectHash/Chm02.c
index 0fda6299..93689690 100644
--- a/src/PerfectHash/Chm02.c
+++ b/src/PerfectHash/Chm02.c
@@ -709,6 +709,17 @@ Return Value:
 
     COPY_GRAPH_COUNTERS_FROM_GRAPH_TO_TABLE();
 
+#ifdef PH_USE_CUDA
+    Table->CuAddKeysElapsedMicroseconds.QuadPart =
+        Graph->CuAddKeysElapsedMicroseconds.QuadPart;
+    Table->CuIsAcyclicElapsedMicroseconds.QuadPart =
+        Graph->CuIsAcyclicElapsedMicroseconds.QuadPart;
+    Table->CuAssignElapsedMicroseconds.QuadPart =
+        Graph->CuAssignElapsedMicroseconds.QuadPart;
+    Table->CuVerifyElapsedMicroseconds.QuadPart =
+        Graph->CuVerifyElapsedMicroseconds.QuadPart;
+#endif
+
     //
     // Note this graph as the one solved to the context.  This is used by the
     // save file work callback we dispatch below.
diff --git a/src/PerfectHash/Chm02Compat.c b/src/PerfectHash/Chm02Compat.c
index 67c61732..a3c7f3e2 100644
--- a/src/PerfectHash/Chm02Compat.c
+++ b/src/PerfectHash/Chm02Compat.c
@@ -700,6 +700,17 @@ Return Value:
 
     COPY_GRAPH_COUNTERS_FROM_GRAPH_TO_TABLE();
 
+#ifdef PH_USE_CUDA
+    Table->CuAddKeysElapsedMicroseconds.QuadPart =
+        Graph->CuAddKeysElapsedMicroseconds.QuadPart;
+    Table->CuIsAcyclicElapsedMicroseconds.QuadPart =
+        Graph->CuIsAcyclicElapsedMicroseconds.QuadPart;
+    Table->CuAssignElapsedMicroseconds.QuadPart =
+        Graph->CuAssignElapsedMicroseconds.QuadPart;
+    Table->CuVerifyElapsedMicroseconds.QuadPart =
+        Graph->CuVerifyElapsedMicroseconds.QuadPart;
+#endif
+
     //
     // Capture RNG details from the winning graph if the RNG used was not the
     // System one.
diff --git a/src/PerfectHash/Graph.h b/src/PerfectHash/Graph.h
index 86a46296..9ba73cf6 100644
--- a/src/PerfectHash/Graph.h
+++ b/src/PerfectHash/Graph.h
@@ -1874,6 +1874,15 @@ typedef struct _Struct_size_bytes_(SizeOfStruct) _GRAPH {
 
     ULONG CuIsAcyclicPhase1Attempts;
 
+    //
+    // Explicit CUDA phase timings for performance benchmarking.
+    //
+
+    LARGE_INTEGER CuAddKeysElapsedMicroseconds;
+    LARGE_INTEGER CuIsAcyclicElapsedMicroseconds;
+    LARGE_INTEGER CuAssignElapsedMicroseconds;
+    LARGE_INTEGER CuVerifyElapsedMicroseconds;
+
     //
     // CUDA RNG details.
     //
@@ -2157,6 +2166,7 @@ extern GRAPH_REGISTER_SOLVED GraphRegisterSolved;
 #ifdef _M_X64
 extern GRAPH_REGISTER_SOLVED GraphRegisterSolvedTsx;
 #endif
+
 extern GRAPH_SHOULD_WE_CONTINUE_TRYING_TO_SOLVE
     GraphShouldWeContinueTryingToSolve;
 extern GRAPH_ADD_KEYS GraphAddKeys;
diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c
index 888b038b..1442ac86 100644
--- a/src/PerfectHash/GraphCu.c
+++ b/src/PerfectHash/GraphCu.c
@@ -1081,6 +1081,10 @@ Return Value:
     Graph->TraversalDepth = 0;
     Graph->TotalTraversals = 0;
     Graph->MaximumTraversalDepth = 0;
+    Graph->CuAddKeysElapsedMicroseconds.QuadPart = 0;
+    Graph->CuIsAcyclicElapsedMicroseconds.QuadPart = 0;
+    Graph->CuAssignElapsedMicroseconds.QuadPart = 0;
+    Graph->CuVerifyElapsedMicroseconds.QuadPart = 0;
 
     Graph->SolvedTime.AsULongLong = 0;
 
@@ -1330,6 +1334,23 @@ IsCudaDebugEnabled(
 #endif
 }
 
+FORCEINLINE
+VOID
+CaptureCuElapsedMicroseconds(
+    _In_ PGRAPH Graph,
+    _In_ LARGE_INTEGER Start,
+    _In_ LARGE_INTEGER End,
+    _Out_ PLARGE_INTEGER ElapsedMicroseconds
+    )
+{
+    LONGLONG Cycles;
+
+    Cycles = End.QuadPart - Start.QuadPart;
+    ElapsedMicroseconds->QuadPart = (
+        (Cycles * 1000000) / Graph->Context->Frequency.QuadPart
+    );
+}
+
 #define DEFINE_VALIDATE_GPU_ORDER(Name,                               \
                                   OrderType,                          \
                                   EdgeType,                           \
@@ -1528,6 +1549,8 @@ GraphCuAddKeys(
     _In_reads_(NumberOfKeys) PKEY Keys
     )
 {
+    LARGE_INTEGER Start;
+    LARGE_INTEGER End;
     PCU Cu;
     HRESULT Result;
 
@@ -1541,11 +1564,19 @@ GraphCuAddKeys(
 
     Cu = Graph->CuSolveContext->DeviceContext->Cu;
 
+    QueryPerformanceCounter(&Start);
+
     Result = Cu->AddKeys(Graph,
                          Graph->CuBlocksPerGrid,
                          Graph->CuThreadsPerBlock,
                          Graph->CuSharedMemory);
 
+    QueryPerformanceCounter(&End);
+    CaptureCuElapsedMicroseconds(Graph,
+                                 Start,
+                                 End,
+                                 &Graph->CuAddKeysElapsedMicroseconds);
+
     if (IsCudaDebugEnabled()) {
         fprintf(stderr,
                 "[GraphCuAddKeys] Result=0x%08x HashKeysResult=0x%08x "
@@ -1564,6 +1595,8 @@ GraphCuIsAcyclic(
     _In_ PGRAPH Graph
     )
 {
+    LARGE_INTEGER Start;
+    LARGE_INTEGER End;
     PCU Cu;
     PRTL Rtl;
     PKEY Keys;
@@ -1584,6 +1617,8 @@ GraphCuIsAcyclic(
     SolveContext = Graph->CuSolveContext;
     Cu = SolveContext->DeviceContext->Cu;
 
+    QueryPerformanceCounter(&Start);
+
     Result = Cu->IsAcyclic(Graph,
                            Graph->CuBlocksPerGrid,
                            Graph->CuThreadsPerBlock,
@@ -1809,6 +1844,12 @@ GraphCuIsAcyclic(
 
     InterlockedIncrement64(&Context->GpuAndCpuIsAcyclicSuccess);
 
+    QueryPerformanceCounter(&End);
+    CaptureCuElapsedMicroseconds(Graph,
+                                 Start,
+                                 End,
+                                 &Graph->CuIsAcyclicElapsedMicroseconds);
+
     return Result;
 }
 
@@ -1817,6 +1858,8 @@ GraphCuAssign(
     _In_ PGRAPH Graph
     )
 {
+    LARGE_INTEGER Start;
+    LARGE_INTEGER End;
     PCU Cu;
     HRESULT Result;
 
@@ -1830,6 +1873,8 @@ GraphCuAssign(
 
     Cu = Graph->CuSolveContext->DeviceContext->Cu;
 
+    QueryPerformanceCounter(&Start);
+
     Result = Cu->Assign(Graph,
                         Graph->CuBlocksPerGrid,
                         Graph->CuThreadsPerBlock,
@@ -1846,6 +1891,12 @@ GraphCuAssign(
                Graph->Assigned,
                Graph->Info->AssignedSizeInBytes);
 
+    QueryPerformanceCounter(&End);
+    CaptureCuElapsedMicroseconds(Graph,
+                                 Start,
+                                 End,
+                                 &Graph->CuAssignElapsedMicroseconds);
+
     return Result;
 }
 
@@ -1854,6 +1905,8 @@ GraphCuVerify(
     _In_ PGRAPH Graph
     )
 {
+    LARGE_INTEGER Start;
+    LARGE_INTEGER End;
     PCU Cu;
     HRESULT Result;
 
@@ -1863,11 +1916,19 @@ GraphCuVerify(
 
     Cu = Graph->CuSolveContext->DeviceContext->Cu;
 
+    QueryPerformanceCounter(&Start);
+
     Result = Cu->Verify(Graph,
                         Graph->CuBlocksPerGrid,
                         Graph->CuThreadsPerBlock,
                         Graph->CuSharedMemory);
 
+    QueryPerformanceCounter(&End);
+    CaptureCuElapsedMicroseconds(Graph,
+                                 Start,
+                                 End,
+                                 &Graph->CuVerifyElapsedMicroseconds);
+
     if (IsCudaDebugEnabled()) {
         fprintf(stderr, "[GraphCuVerify] GpuVerifyResult=0x%08x\n", (unsigned)Result);
     }
diff --git a/src/PerfectHash/PerfectHashCompat.c b/src/PerfectHash/PerfectHashCompat.c
index cc63c7f8..ddb88019 100644
--- a/src/PerfectHash/PerfectHashCompat.c
+++ b/src/PerfectHash/PerfectHashCompat.c
@@ -615,7 +615,7 @@ QueryPerformanceFrequency(
     _Out_ LARGE_INTEGER* lpFrequency
     )
 {
-    lpFrequency->QuadPart = 1000;
+    lpFrequency->QuadPart = 1000000000ull;
     return TRUE;
 }
 
diff --git a/src/PerfectHash/PerfectHashTable.h b/src/PerfectHash/PerfectHashTable.h
index 21483401..33d516d5 100644
--- a/src/PerfectHash/PerfectHashTable.h
+++ b/src/PerfectHash/PerfectHashTable.h
@@ -870,6 +870,11 @@ typedef struct _Struct_size_bytes_(SizeOfStruct) _PERFECT_HASH_TABLE {
 
     DECL_GRAPH_COUNTERS_WITHIN_STRUCT();
 
+    LARGE_INTEGER CuAddKeysElapsedMicroseconds;
+    LARGE_INTEGER CuIsAcyclicElapsedMicroseconds;
+    LARGE_INTEGER CuAssignElapsedMicroseconds;
+    LARGE_INTEGER CuVerifyElapsedMicroseconds;
+
     //
     // Rng details from the winning graph.
     //
diff --git a/src/PerfectHash/TableCreateCsv.h b/src/PerfectHash/TableCreateCsv.h
index 8d4417ac..b1b5d828 100644
--- a/src/PerfectHash/TableCreateCsv.h
+++ b/src/PerfectHash/TableCreateCsv.h
@@ -432,6 +432,22 @@ Module Name:
           Context->VerifyElapsedMicroseconds.QuadPart,                                       \
           OUTPUT_INT)                                                                        \
                                                                                              \
+    ENTRY(CuAddKeysMicroseconds,                                                             \
+          Table->CuAddKeysElapsedMicroseconds.QuadPart,                                      \
+          OUTPUT_INT)                                                                        \
+                                                                                             \
+    ENTRY(CuIsAcyclicMicroseconds,                                                           \
+          Table->CuIsAcyclicElapsedMicroseconds.QuadPart,                                    \
+          OUTPUT_INT)                                                                        \
+                                                                                             \
+    ENTRY(CuAssignMicroseconds,                                                              \
+          Table->CuAssignElapsedMicroseconds.QuadPart,                                       \
+          OUTPUT_INT)                                                                        \
+                                                                                             \
+    ENTRY(CuVerifyMicroseconds,                                                              \
+          Table->CuVerifyElapsedMicroseconds.QuadPart,                                       \
+          OUTPUT_INT)                                                                        \
+                                                                                             \
     ENTRY(BenchmarkWarmups,                                                                  \
           Table->BenchmarkWarmups,                                                           \
           OUTPUT_INT)                                                                        \
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 9c95ab2c..f4a836f4 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -301,10 +301,21 @@ if(TARGET PerfectHashCreateExe AND TARGET PerfectHashBulkCreateExe)
                 -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_generated_33000_test.cmake
         )
 
+        add_test(
+            NAME perfecthash.cuda.chm02.perf-surface
+            COMMAND ${CMAKE_COMMAND}
+                -DTEST_EXE=$<TARGET_FILE:PerfectHashCreateExe>
+                -DTEST_KEYS=${TEST_KEYS_FILE}
+                -DTEST_OUTPUT=${TEST_OUTPUT_DIR}/cuda-chm02-perf-surface
+                -DTEST_PYTHON=${Python3_EXECUTABLE}
+                -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_perf_benchmark.cmake
+        )
+
         set_tests_properties(
             perfecthash.cuda.chm02.hologram.nofileio
             perfecthash.cuda.chm02.hologram.fileio
             perfecthash.cuda.chm02.generated33000.nofileio
+            perfecthash.cuda.chm02.perf-surface
             PROPERTIES
                 LABELS "cuda"
         )
diff --git a/tests/run_cli_chm02_cuda_perf_benchmark.cmake b/tests/run_cli_chm02_cuda_perf_benchmark.cmake
new file mode 100644
index 00000000..1bf4fbba
--- /dev/null
+++ b/tests/run_cli_chm02_cuda_perf_benchmark.cmake
@@ -0,0 +1,72 @@
+if(NOT DEFINED TEST_EXE)
+  message(FATAL_ERROR "TEST_EXE is required")
+endif()
+if(NOT DEFINED TEST_KEYS)
+  message(FATAL_ERROR "TEST_KEYS is required")
+endif()
+if(NOT DEFINED TEST_OUTPUT)
+  message(FATAL_ERROR "TEST_OUTPUT is required")
+endif()
+if(NOT DEFINED TEST_PYTHON)
+  message(FATAL_ERROR "TEST_PYTHON is required")
+endif()
+
+file(TO_NATIVE_PATH "${TEST_EXE}" test_exe_native)
+file(TO_NATIVE_PATH "${TEST_KEYS}" test_keys_native)
+file(TO_NATIVE_PATH "${TEST_OUTPUT}" test_output_native)
+file(REMOVE_RECURSE "${test_output_native}")
+file(MAKE_DIRECTORY "${test_output_native}")
+
+set(test_args
+  Chm02
+  Mulshrolate3RX
+  And
+  1
+  --CuConcurrency=1
+  --FixedAttempts=2
+  --Seeds=0xF0192B55,0xD9C83970,0x0C1E0D10,0xD11A5847
+  --NoFileIo
+  --SkipTestAfterCreate
+)
+
+execute_process(
+  COMMAND "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${test_args}
+  RESULT_VARIABLE result
+  OUTPUT_VARIABLE stdout
+  ERROR_VARIABLE stderr
+)
+
+if(NOT result EQUAL 0)
+  message(STATUS "stdout: ${stdout}")
+  message(STATUS "stderr: ${stderr}")
+  message(FATAL_ERROR "Command failed with exit code ${result}")
+endif()
+
+file(GLOB csv_files "${test_output_native}/*.csv")
+list(LENGTH csv_files csv_count)
+if(NOT csv_count EQUAL 1)
+  message(FATAL_ERROR "Expected exactly one CSV output file, found ${csv_count}")
+endif()
+
+set(csv_path "${csv_files}")
+
+execute_process(
+  COMMAND "${TEST_PYTHON}" -c
+          "import csv, sys; \
+path = sys.argv[1]; \
+required = ['CuAddKeysMicroseconds', 'CuIsAcyclicMicroseconds', 'CuAssignMicroseconds', 'CuVerifyMicroseconds']; \
+row = next(csv.DictReader(open(path, newline=''))); \
+missing = [name for name in required if name not in row]; \
+assert not missing, f'Missing timing field(s): {missing}'; \
+negative = [name for name in required if int(row[name]) < 0]; \
+assert not negative, f'Negative timing field(s): {negative}'" "${csv_path}"
+  RESULT_VARIABLE parse_result
+  OUTPUT_VARIABLE parse_stdout
+  ERROR_VARIABLE parse_stderr
+)
+
+if(NOT parse_result EQUAL 0)
+  message(STATUS "stdout: ${parse_stdout}")
+  message(STATUS "stderr: ${parse_stderr}")
+  message(FATAL_ERROR "CSV timing validation failed (exit ${parse_result})")
+endif()

From ccdcd5ef10fdb6e0df1a6f4c035b7d5a24839c45 Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Sun, 29 Mar 2026 22:04:17 -0700
Subject: [PATCH 10/14] Chm02 CUDA: address review findings

---
 docs/chm02-cuda-mainline.md                   | 67 +++++++++++++++++++
 src/PerfectHash/BulkCreateBestCsv.h           |  6 +-
 src/PerfectHash/BulkCreateCsv.h               |  6 +-
 src/PerfectHash/Chm02.c                       | 15 +----
 src/PerfectHash/Graph.c                       | 24 ++-----
 src/PerfectHash/Graph.h                       | 10 ++-
 src/PerfectHash/GraphCu.c                     | 27 ++------
 src/PerfectHash/PerfectHashPrivate.h          | 13 ++++
 src/PerfectHash/TableCreateBestCsv.h          |  6 +-
 src/PerfectHash/TableCreateCsv.h              |  6 +-
 src/PerfectHashCuda/Graph.cu                  | 10 +--
 .../run_cli_chm02_cuda_known_seed_test.cmake  | 10 ---
 12 files changed, 110 insertions(+), 90 deletions(-)
 create mode 100644 docs/chm02-cuda-mainline.md

diff --git a/docs/chm02-cuda-mainline.md b/docs/chm02-cuda-mainline.md
new file mode 100644
index 00000000..26aaf558
--- /dev/null
+++ b/docs/chm02-cuda-mainline.md
@@ -0,0 +1,67 @@
+# Chm02 CUDA Mainline Note
+
+## Summary
+
+This note captures the intent of the `issue-79-chm02-cuda-mainline` branch.
+
+The branch promotes the legacy `Chm02` CUDA path from a CPU-assisted bring-up
+ state toward a first-class correctness path by moving the major solve phases
+ (`IsAcyclic`, `Assign`, `Verify`) onto the GPU while keeping CPU-oracle-style
+ validation and debugging support available during bring-up.
+
+## Goals
+
+- Fix correctness blockers in the existing `Chm02` CUDA path.
+- Make known-seed CLI runs succeed on Linux in both no-file-io and file-io
+  configurations.
+- Add regression coverage for:
+  - known-seed `Chm02` CUDA runs
+  - a generated non-`Assigned16` case
+  - timing-field presence
+- Expose explicit per-phase CUDA timing fields for measurement.
+
+## Non-Goals
+
+- High-throughput GPU solving.
+- Batched multi-attempt GPU construction.
+- Replacing the standalone GPU peeling POC.
+- Eliminating all CPU-oracle/debug-only code from the branch.
+
+The current `Chm02` CUDA implementation remains correctness-first, not
+ throughput-first.
+
+## Supported Scope
+
+- Algorithm: `Chm02`
+- Hash path: known-good seeded hash families already supported by the repo
+- CUDA path: single-graph bring-up / validation
+- Platform focus:
+  - Linux with CUDA enabled
+  - existing regression coverage on the configured CUDA host
+
+## Fallback / Debugging Policy
+
+- Normal operation should use the GPU path for add-keys, acyclic detection,
+  assignment, and verify.
+- CPU-oracle and order-validation logic is intended as bring-up/debug support.
+- `PH_DEBUG_CUDA_CHM02` enables extra logging and validation details for
+  troubleshooting.
+
+## Timing Contract
+
+The following CSV fields are emitted:
+
+- `CuAddKeysMicroseconds`
+- `CuIsAcyclicMicroseconds`
+- `CuAssignMicroseconds`
+- `CuVerifyMicroseconds`
+
+These are synchronized phase timings around the CUDA-backed phase wrappers, not
+ raw kernel-only device timings.
+
+## Acceptance
+
+- The focused CUDA `Chm02` regression tests pass when CUDA is enabled.
+- Known-seed `Chm02` CUDA runs succeed on Linux.
+- File-io and no-file-io paths both work in the covered scenarios.
+- Timing fields are present and non-negative in CSV output.
diff --git a/src/PerfectHash/BulkCreateBestCsv.h b/src/PerfectHash/BulkCreateBestCsv.h
index 13e8019c..ee57f8c9 100644
--- a/src/PerfectHash/BulkCreateBestCsv.h
+++ b/src/PerfectHash/BulkCreateBestCsv.h
@@ -280,16 +280,12 @@ Module Name:
           Context->GpuAddKeysSuccessButCpuAddKeysFailures,                                   \
           OUTPUT_INT)                                                                        \
                                                                                              \
-    ENTRY(GpuIsAcyclicButCpuIsCyclicFailures,                                                \
-          Context->GpuIsAcyclicButCpuIsCyclicFailures,                                       \
-          OUTPUT_INT)                                                                        \
-                                                                                             \
     ENTRY(GpuAndCpuAddKeysSuccess,                                                           \
           Context->GpuAndCpuAddKeysSuccess,                                                  \
           OUTPUT_INT)                                                                        \
                                                                                              \
     ENTRY(GpuAndCpuIsAcyclicSuccess,                                                         \
-          Context->GpuAndCpuAddKeysSuccess,                                                  \
+          Context->GpuAndCpuIsAcyclicSuccess,                                                \
           OUTPUT_INT)                                                                        \
                                                                                              \
     ENTRY(BestCoverageAttempts,                                                              \
diff --git a/src/PerfectHash/BulkCreateCsv.h b/src/PerfectHash/BulkCreateCsv.h
index dbf876e9..2871fc87 100644
--- a/src/PerfectHash/BulkCreateCsv.h
+++ b/src/PerfectHash/BulkCreateCsv.h
@@ -279,16 +279,12 @@ Module Name:
           Context->GpuAddKeysSuccessButCpuAddKeysFailures,                                   \
           OUTPUT_INT)                                                                        \
                                                                                              \
-    ENTRY(GpuIsAcyclicButCpuIsCyclicFailures,                                                \
-          Context->GpuIsAcyclicButCpuIsCyclicFailures,                                       \
-          OUTPUT_INT)                                                                        \
-                                                                                             \
     ENTRY(GpuAndCpuAddKeysSuccess,                                                           \
           Context->GpuAndCpuAddKeysSuccess,                                                  \
           OUTPUT_INT)                                                                        \
                                                                                              \
     ENTRY(GpuAndCpuIsAcyclicSuccess,                                                         \
-          Context->GpuAndCpuAddKeysSuccess,                                                  \
+          Context->GpuAndCpuIsAcyclicSuccess,                                                \
           OUTPUT_INT)                                                                        \
                                                                                              \
     ENTRY(BestCoverageAttempts,                                                              \
diff --git a/src/PerfectHash/Chm02.c b/src/PerfectHash/Chm02.c
index 93689690..242a58fe 100644
--- a/src/PerfectHash/Chm02.c
+++ b/src/PerfectHash/Chm02.c
@@ -17,19 +17,6 @@ Module Name:
 #include "Chm01.h"
 #include "Chm02Private.h"
 
-FORCEINLINE
-BOOLEAN
-IsChm02CudaDebugEnabled(
-    VOID
-    )
-{
-#ifdef PH_WINDOWS
-    return (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0);
-#else
-    return (getenv("PH_DEBUG_CUDA_CHM02") != NULL);
-#endif
-}
-
 //
 // Main table creation implementation routine for Chm02.
 //
@@ -196,7 +183,7 @@ Return Value:
 
     TableCreateFlags.AsULongLong = Table->TableCreateFlags.AsULongLong;
     Silent = (TableCreateFlags.Silent != FALSE);
-    DebugCuda = IsChm02CudaDebugEnabled();
+    DebugCuda = IsPerfectHashCudaDebugEnabled();
 
     //
     // Initialize variables used if we jump to Error early-on.
diff --git a/src/PerfectHash/Graph.c b/src/PerfectHash/Graph.c
index bd35725b..17d1adb7 100644
--- a/src/PerfectHash/Graph.c
+++ b/src/PerfectHash/Graph.c
@@ -555,13 +555,7 @@ Return Value:
     Table = Context->Table;
     NumberOfKeys = Table->Keys->NumberOfKeys.LowPart;
     Edges = Keys = (PKEY)Table->Keys->KeyArrayBaseAddress;
-    DebugCudaSolve = FALSE;
-
-#ifdef PH_WINDOWS
-    DebugCudaSolve = (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0);
-#else
-    DebugCudaSolve = (getenv("PH_DEBUG_CUDA_CHM02") != NULL);
-#endif
+    DebugCudaSolve = (IsCuGraph(Graph) ? IsCudaDebugGraph(Graph) : FALSE);
 
     //
     // Attempt to add all the keys to the graph.
@@ -4017,12 +4011,8 @@ Return Value:
         return E_POINTER;
     }
 
-    DebugCudaSolve = FALSE;
-#ifdef PH_WINDOWS
-    DebugCudaSolve = (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0);
-#else
-    DebugCudaSolve = (getenv("PH_DEBUG_CUDA_CHM02") != NULL);
-#endif
+    DebugCudaSolve = (IsCuGraph(Graph) ? IsPerfectHashCudaDebugEnabled() : FALSE);
+    Graph->Flags.DebugCudaChm02 = DebugCudaSolve;
 
     //
     // Acquire the exclusive graph lock for the duration of the routine.  The
@@ -6975,13 +6965,7 @@ Return Value:
     CoverageType = Context->BestCoverageType;
     Attempt = Coverage->Attempt;
     ElapsedMilliseconds = GetTickCount64() - Context->StartMilliseconds;
-    DebugCudaSolve = FALSE;
-
-#ifdef PH_WINDOWS
-    DebugCudaSolve = (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0);
-#else
-    DebugCudaSolve = (getenv("PH_DEBUG_CUDA_CHM02") != NULL);
-#endif
+    DebugCudaSolve = (IsCuGraph(Graph) ? IsCudaDebugGraph(Graph) : FALSE);
 
     if (DebugCudaSolve && IsCuGraph(Graph)) {
         fprintf(stderr,
diff --git a/src/PerfectHash/Graph.h b/src/PerfectHash/Graph.h
index 9ba73cf6..884452e4 100644
--- a/src/PerfectHash/Graph.h
+++ b/src/PerfectHash/Graph.h
@@ -906,11 +906,18 @@ typedef union _GRAPH_FLAGS {
 
         ULONG HasUserSeeds:1;
 
+        //
+        // When set, enables PH_DEBUG_CUDA_CHM02 bring-up logging for this
+        // graph without re-reading the environment variable in hot paths.
+        //
+
+        ULONG DebugCudaChm02:1;
+
         //
         // Unused bits.
         //
 
-        ULONG Unused:10;
+        ULONG Unused:9;
     };
     LONG AsLong;
     ULONG AsULong;
@@ -935,6 +942,7 @@ C_ASSERT(sizeof(GRAPH_FLAGS) == sizeof(ULONG));
     ((Graph)->Flags.WantsCuRandomHostSeeds != FALSE)
 #define IsGraphParanoid(Graph) ((Graph)->Flags.Paranoid != FALSE)
 #define IsUsingAssigned16(Graph) ((Graph)->Flags.UsingAssigned16 != FALSE)
+#define IsCudaDebugGraph(Graph) ((Graph)->Flags.DebugCudaChm02 != FALSE)
 
 #define SetSpareGraph(Graph) (Graph->Flags.IsSpareGraph = TRUE)
 #define SetSpareCuGraph(Graph) (Graph->Flags.IsSpareCuGraph = TRUE)
diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c
index 1442ac86..2e13d422 100644
--- a/src/PerfectHash/GraphCu.c
+++ b/src/PerfectHash/GraphCu.c
@@ -1321,19 +1321,6 @@ GraphCuLoadNewSeeds(
     return Result;
 }
 
-FORCEINLINE
-BOOLEAN
-IsCudaDebugEnabled(
-    VOID
-    )
-{
-#ifdef PH_WINDOWS
-    return (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0);
-#else
-    return (getenv("PH_DEBUG_CUDA_CHM02") != NULL);
-#endif
-}
-
 FORCEINLINE
 VOID
 CaptureCuElapsedMicroseconds(
@@ -1577,7 +1564,7 @@ GraphCuAddKeys(
                                  End,
                                  &Graph->CuAddKeysElapsedMicroseconds);
 
-    if (IsCudaDebugEnabled()) {
+    if (IsCudaDebugGraph(Graph)) {
         fprintf(stderr,
                 "[GraphCuAddKeys] Result=0x%08x HashKeysResult=0x%08x "
                 "VertexFailures=%u WarpFailures=%u\n",
@@ -1624,7 +1611,7 @@ GraphCuIsAcyclic(
                            Graph->CuThreadsPerBlock,
                            Graph->CuSharedMemory);
 
-    if (IsCudaDebugEnabled()) {
+    if (IsCudaDebugGraph(Graph)) {
         fprintf(stderr,
                 "[GraphCuIsAcyclic] GpuResult=0x%08x Attempts=%u "
                 "DeletedEdges=%u OrderIndex=%ld\n",
@@ -1711,7 +1698,7 @@ GraphCuIsAcyclic(
         InterlockedIncrement64(&Context->GpuAndCpuAddKeysSuccess);
     }
 
-    if (IsCudaDebugEnabled()) {
+        if (IsCudaDebugGraph(Graph)) {
         HRESULT CpuAcyclicResult;
         HRESULT GpuOrderValidationResult;
         LONG InvalidEdge;
@@ -1863,7 +1850,7 @@ GraphCuAssign(
     PCU Cu;
     HRESULT Result;
 
-    if (IsCudaDebugEnabled()) {
+    if (IsCudaDebugGraph(Graph)) {
         fprintf(stderr,
                 "[GraphCuAssign] Enter IsAcyclic=%u OrderIndex=%ld CpuOrderIndex=%ld\n",
                 (unsigned)Graph->Flags.IsAcyclic,
@@ -1880,7 +1867,7 @@ GraphCuAssign(
                         Graph->CuThreadsPerBlock,
                         Graph->CuSharedMemory);
 
-    if (IsCudaDebugEnabled()) {
+    if (IsCudaDebugGraph(Graph)) {
         fprintf(stderr, "[GraphCuAssign] GpuAssignResult=0x%08x\n", (unsigned)Result);
     }
     if (FAILED(Result)) {
@@ -1910,7 +1897,7 @@ GraphCuVerify(
     PCU Cu;
     HRESULT Result;
 
-    if (IsCudaDebugEnabled()) {
+    if (IsCudaDebugGraph(Graph)) {
         fprintf(stderr, "[GraphCuVerify] Enter\n");
     }
 
@@ -1929,7 +1916,7 @@ GraphCuVerify(
                                  End,
                                  &Graph->CuVerifyElapsedMicroseconds);
 
-    if (IsCudaDebugEnabled()) {
+    if (IsCudaDebugGraph(Graph)) {
         fprintf(stderr, "[GraphCuVerify] GpuVerifyResult=0x%08x\n", (unsigned)Result);
     }
 
diff --git a/src/PerfectHash/PerfectHashPrivate.h b/src/PerfectHash/PerfectHashPrivate.h
index 17fb0bb5..c0c00d7d 100644
--- a/src/PerfectHash/PerfectHashPrivate.h
+++ b/src/PerfectHash/PerfectHashPrivate.h
@@ -48,6 +48,19 @@ extern MODULEINFO PerfectHashModuleInfo;
 
 extern volatile ULONG CtrlCPressed;
 
+FORCEINLINE
+BOOLEAN
+IsPerfectHashCudaDebugEnabled(
+    VOID
+    )
+{
+#ifdef PH_WINDOWS
+    return (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0);
+#else
+    return (getenv("PH_DEBUG_CUDA_CHM02") != NULL);
+#endif
+}
+
 //
 // Define a helper macro for validating flags passed as parameters to routines.
 //
diff --git a/src/PerfectHash/TableCreateBestCsv.h b/src/PerfectHash/TableCreateBestCsv.h
index fc20a48f..5b610151 100644
--- a/src/PerfectHash/TableCreateBestCsv.h
+++ b/src/PerfectHash/TableCreateBestCsv.h
@@ -280,16 +280,12 @@ Module Name:
           Context->GpuAddKeysSuccessButCpuAddKeysFailures,                                   \
           OUTPUT_INT)                                                                        \
                                                                                              \
-    ENTRY(GpuIsAcyclicButCpuIsCyclicFailures,                                                \
-          Context->GpuIsAcyclicButCpuIsCyclicFailures,                                       \
-          OUTPUT_INT)                                                                        \
-                                                                                             \
     ENTRY(GpuAndCpuAddKeysSuccess,                                                           \
           Context->GpuAndCpuAddKeysSuccess,                                                  \
           OUTPUT_INT)                                                                        \
                                                                                              \
     ENTRY(GpuAndCpuIsAcyclicSuccess,                                                         \
-          Context->GpuAndCpuAddKeysSuccess,                                                  \
+          Context->GpuAndCpuIsAcyclicSuccess,                                                \
           OUTPUT_INT)                                                                        \
                                                                                              \
     ENTRY(BestCoverageAttempts,                                                              \
diff --git a/src/PerfectHash/TableCreateCsv.h b/src/PerfectHash/TableCreateCsv.h
index b1b5d828..a4f525b6 100644
--- a/src/PerfectHash/TableCreateCsv.h
+++ b/src/PerfectHash/TableCreateCsv.h
@@ -279,16 +279,12 @@ Module Name:
           Context->GpuAddKeysSuccessButCpuAddKeysFailures,                                   \
           OUTPUT_INT)                                                                        \
                                                                                              \
-    ENTRY(GpuIsAcyclicButCpuIsCyclicFailures,                                                \
-          Context->GpuIsAcyclicButCpuIsCyclicFailures,                                       \
-          OUTPUT_INT)                                                                        \
-                                                                                             \
     ENTRY(GpuAndCpuAddKeysSuccess,                                                           \
           Context->GpuAndCpuAddKeysSuccess,                                                  \
           OUTPUT_INT)                                                                        \
                                                                                              \
     ENTRY(GpuAndCpuIsAcyclicSuccess,                                                         \
-          Context->GpuAndCpuAddKeysSuccess,                                                  \
+          Context->GpuAndCpuIsAcyclicSuccess,                                                \
           OUTPUT_INT)                                                                        \
                                                                                              \
     ENTRY(BestCoverageAttempts,                                                              \
diff --git a/src/PerfectHashCuda/Graph.cu b/src/PerfectHashCuda/Graph.cu
index fbbc7504..d4ece595 100644
--- a/src/PerfectHashCuda/Graph.cu
+++ b/src/PerfectHashCuda/Graph.cu
@@ -926,10 +926,6 @@ GraphCuIsAcyclicSerialKernel(
             VertexType Vertex1 = Edge3->Vertex1;
             VertexType Vertex2 = Edge3->Vertex2;
 
-            if (Vertices3[VertexIndex].Degree != 1) {
-                continue;
-            }
-
             if (Vertices3[Vertex1].Degree > 0) {
                 --Vertices3[Vertex1].Degree;
                 Vertices3[Vertex1].Edges ^= Edge;
@@ -1029,7 +1025,11 @@ GraphCuAssignSerialKernel(
             Value = (AssignedType)(Value + NumberOfEdges);
         }
 
-        ASSERT(Assigned[Vertex1] == INITIAL_ASSIGNMENT_VALUE);
+        //
+        // Assigned[] is expected to be reset to INITIAL_ASSIGNMENT_VALUE before
+        // this kernel runs.  Avoid trapping the device here; surface any
+        // actual behavioral regressions through the normal verify path.
+        //
         Assigned[Vertex1] = Value;
 
         GraphCuRegisterVertexVisit(Graph, Vertex1);
diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake
index 778f5b31..7486f564 100644
--- a/tests/run_cli_chm02_cuda_known_seed_test.cmake
+++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake
@@ -92,11 +92,6 @@ if(DEFINED REQUIRE_GPU_ASSIGN AND REQUIRE_GPU_ASSIGN)
   if(gpu_assign_index EQUAL -1)
     message(FATAL_ERROR "Expected GPU assignment success log, but it was not present.")
   endif()
-
-  string(FIND "${stderr}" "[GraphCuAssign] CpuAssignResult=" cpu_assign_index)
-  if(NOT cpu_assign_index EQUAL -1)
-    message(FATAL_ERROR "Expected GPU assignment path without CPU assign fallback, but CPU assign log was present.")
-  endif()
 endif()
 
 if(DEFINED REQUIRE_GPU_ORDER_VALID AND REQUIRE_GPU_ORDER_VALID)
@@ -111,9 +106,4 @@ if(DEFINED REQUIRE_GPU_VERIFY AND REQUIRE_GPU_VERIFY)
   if(gpu_verify_index EQUAL -1)
     message(FATAL_ERROR "Expected GPU verify success log, but it was not present.")
   endif()
-
-  string(FIND "${stderr}" "[GraphCuVerify] CpuVerifyResult=" cpu_verify_index)
-  if(NOT cpu_verify_index EQUAL -1)
-    message(FATAL_ERROR "Expected GPU verify path without CPU verify fallback, but CPU verify log was present.")
-  endif()
 endif()

From c119c5b7f24c05295cf65385d7b8a96cebaabff7 Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Sun, 29 Mar 2026 22:08:28 -0700
Subject: [PATCH 11/14] Chm02 CUDA: expand mainline design note

---
 docs/chm02-cuda-mainline.md | 52 +++++++++++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/docs/chm02-cuda-mainline.md b/docs/chm02-cuda-mainline.md
index 26aaf558..749a343f 100644
--- a/docs/chm02-cuda-mainline.md
+++ b/docs/chm02-cuda-mainline.md
@@ -39,6 +39,13 @@ The current `Chm02` CUDA implementation remains correctness-first, not
   - Linux with CUDA enabled
   - existing regression coverage on the configured CUDA host
 
+The following supporting code changes are considered in-scope for this branch:
+
+- Linux file-work compatibility fixes needed for the `Chm02Compat` path
+- CSV/timing schema updates needed to surface CUDA phase timing
+- the Linux `QueryPerformanceFrequency()` correction that makes those timings
+  sane on non-Windows builds
+
 ## Fallback / Debugging Policy
 
 - Normal operation should use the GPU path for add-keys, acyclic detection,
@@ -59,6 +66,51 @@ The following CSV fields are emitted:
 These are synchronized phase timings around the CUDA-backed phase wrappers, not
  raw kernel-only device timings.
 
+Compatibility note:
+
+- this branch intentionally changes the CSV schema by removing the stale
+  `GpuIsAcyclicButCpuIsCyclicFailures` field and adding the four `Cu*`
+  timing fields above
+- downstream CSV consumers should treat this as a schema change for the
+  correctness-first CUDA bring-up path
+- the existing non-CUDA timing fields should continue to use the same timing
+  base; the Linux `QueryPerformanceFrequency()` fix is included specifically so
+  those timings remain coherent on this platform
+
+## Failure-Path Expectations
+
+- Cyclic graphs are expected to return normal non-success solve results; they
+  are not considered internal errors.
+- CUDA-disabled builds are expected to continue using the non-CUDA code paths.
+- GPU order-validation and extra CPU-oracle diagnostics are debug-only aids,
+  controlled by `PH_DEBUG_CUDA_CHM02`.
+- Non-debug runs are expected to surface failure through the normal `HRESULT`
+  and verification paths, not through verbose stderr diagnostics.
+
+## Debug Surface
+
+The following debug surface is intentionally supported for this bring-up phase:
+
+- `PH_DEBUG_CUDA_CHM02`
+- stderr logging from the CUDA `Chm02` path
+- known-seed regression harnesses that use the debug log to confirm the expected
+  GPU path was exercised
+
+This surface is explicitly considered temporary bring-up instrumentation, not a
+ long-term stable user-facing API.
+
+## Staged Task List
+
+1. Fix correctness blockers in the legacy CUDA `Chm02` path.
+2. Establish known-seed Linux no-file-io parity.
+3. Establish Linux file-io parity.
+4. Move assignment and verify onto the GPU.
+5. Add focused CUDA regression coverage:
+   - known-seed path
+   - non-`Assigned16` generated path
+   - timing-field presence
+6. Expose explicit per-phase CUDA timing fields for measurement.
+
 ## Acceptance
 
 - The focused CUDA `Chm02` regression tests pass when CUDA is enabled.

From 7507e4bd185d938c8a9510700ee7d97bf32642fc Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Mon, 30 Mar 2026 08:45:50 -0700
Subject: [PATCH 12/14] Chm02 CUDA: tighten review contracts

---
 docs/chm02-cuda-mainline.md                   | 24 +++++++++++--------
 src/PerfectHash/BulkCreateBestCsv.h           |  4 ++++
 src/PerfectHash/BulkCreateCsv.h               |  4 ++++
 src/PerfectHash/GraphCu.c                     | 12 ++++++++++
 src/PerfectHash/TableCreateBestCsv.h          |  4 ++++
 src/PerfectHash/TableCreateCsv.h              |  4 ++++
 src/PerfectHashCuda/Graph.cu                  | 12 ++++++----
 .../run_cli_chm02_cuda_known_seed_test.cmake  | 12 +++++-----
 8 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/docs/chm02-cuda-mainline.md b/docs/chm02-cuda-mainline.md
index 749a343f..507b2478 100644
--- a/docs/chm02-cuda-mainline.md
+++ b/docs/chm02-cuda-mainline.md
@@ -68,14 +68,14 @@ These are synchronized phase timings around the CUDA-backed phase wrappers, not
 
 Compatibility note:
 
-- this branch intentionally changes the CSV schema by removing the stale
-  `GpuIsAcyclicButCpuIsCyclicFailures` field and adding the four `Cu*`
-  timing fields above
-- downstream CSV consumers should treat this as a schema change for the
-  correctness-first CUDA bring-up path
+- this branch preserves the historical
+  `GpuIsAcyclicButCpuIsCyclicFailures` column as a zero-valued compatibility
+  stub in order to keep downstream CSV column positions stable
+- this branch intentionally adds the four `Cu*` timing fields above
 - the existing non-CUDA timing fields should continue to use the same timing
   base; the Linux `QueryPerformanceFrequency()` fix is included specifically so
-  those timings remain coherent on this platform
+  those timings remain coherent on this platform as well as for the new CUDA
+  timing fields
 
 ## Failure-Path Expectations
 
@@ -86,6 +86,8 @@ Compatibility note:
   controlled by `PH_DEBUG_CUDA_CHM02`.
 - Non-debug runs are expected to surface failure through the normal `HRESULT`
   and verification paths, not through verbose stderr diagnostics.
+- The current serial CUDA kernels are correctness-first and must not be treated
+  as throughput-optimized production behavior.
 
 ## Debug Surface
 
@@ -93,8 +95,10 @@ The following debug surface is intentionally supported for this bring-up phase:
 
 - `PH_DEBUG_CUDA_CHM02`
 - stderr logging from the CUDA `Chm02` path
-- known-seed regression harnesses that use the debug log to confirm the expected
-  GPU path was exercised
+- stable debug tokens used by the known-seed regression harnesses:
+  - `PH_CHM02_CUDA_ORDER_OK`
+  - `PH_CHM02_CUDA_ASSIGN_OK`
+  - `PH_CHM02_CUDA_VERIFY_OK`
 
 This surface is explicitly considered temporary bring-up instrumentation, not a
  long-term stable user-facing API.
@@ -105,11 +109,11 @@ This surface is explicitly considered temporary bring-up instrumentation, not a
 2. Establish known-seed Linux no-file-io parity.
 3. Establish Linux file-io parity.
 4. Move assignment and verify onto the GPU.
-5. Add focused CUDA regression coverage:
+5. Expose explicit per-phase CUDA timing fields for measurement.
+6. Add focused CUDA regression coverage:
    - known-seed path
    - non-`Assigned16` generated path
    - timing-field presence
-6. Expose explicit per-phase CUDA timing fields for measurement.
 
 ## Acceptance
 
diff --git a/src/PerfectHash/BulkCreateBestCsv.h b/src/PerfectHash/BulkCreateBestCsv.h
index ee57f8c9..fcb609f4 100644
--- a/src/PerfectHash/BulkCreateBestCsv.h
+++ b/src/PerfectHash/BulkCreateBestCsv.h
@@ -280,6 +280,10 @@ Module Name:
           Context->GpuAddKeysSuccessButCpuAddKeysFailures,                                   \
           OUTPUT_INT)                                                                        \
                                                                                              \
+    ENTRY(GpuIsAcyclicButCpuIsCyclicFailures,                                                \
+          0,                                                                                 \
+          OUTPUT_INT)                                                                        \
+                                                                                             \
     ENTRY(GpuAndCpuAddKeysSuccess,                                                           \
           Context->GpuAndCpuAddKeysSuccess,                                                  \
           OUTPUT_INT)                                                                        \
diff --git a/src/PerfectHash/BulkCreateCsv.h b/src/PerfectHash/BulkCreateCsv.h
index 2871fc87..ed7f83ad 100644
--- a/src/PerfectHash/BulkCreateCsv.h
+++ b/src/PerfectHash/BulkCreateCsv.h
@@ -279,6 +279,10 @@ Module Name:
           Context->GpuAddKeysSuccessButCpuAddKeysFailures,                                   \
           OUTPUT_INT)                                                                        \
                                                                                              \
+    ENTRY(GpuIsAcyclicButCpuIsCyclicFailures,                                                \
+          0,                                                                                 \
+          OUTPUT_INT)                                                                        \
+                                                                                             \
     ENTRY(GpuAndCpuAddKeysSuccess,                                                           \
           Context->GpuAndCpuAddKeysSuccess,                                                  \
           OUTPUT_INT)                                                                        \
diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c
index 2e13d422..9da08365 100644
--- a/src/PerfectHash/GraphCu.c
+++ b/src/PerfectHash/GraphCu.c
@@ -1793,6 +1793,7 @@ GraphCuIsAcyclic(
             fprintf(stderr,
                     "[GraphCuIsAcyclic] GpuOrderValidationResult=0x%08x\n",
                     (unsigned)GpuOrderValidationResult);
+            fprintf(stderr, "PH_CHM02_CUDA_ORDER_OK\n");
         } else {
             fprintf(stderr,
                     "[GraphCuIsAcyclic] GpuOrderValidationResult=0x%08x "
@@ -1869,11 +1870,19 @@ GraphCuAssign(
 
     if (IsCudaDebugGraph(Graph)) {
         fprintf(stderr, "[GraphCuAssign] GpuAssignResult=0x%08x\n", (unsigned)Result);
+        if (SUCCEEDED(Result)) {
+            fprintf(stderr, "PH_CHM02_CUDA_ASSIGN_OK\n");
+        }
     }
     if (FAILED(Result)) {
         return Result;
     }
 
+    //
+    // The Cu graph load-info path allocates Assigned as managed memory and
+    // Cu->Assign() synchronizes the stream before returning, so the host copy
+    // below is reading coherent data.
+    //
     CopyMemory(Graph->CpuGraph->Assigned,
                Graph->Assigned,
                Graph->Info->AssignedSizeInBytes);
@@ -1918,6 +1927,9 @@ GraphCuVerify(
 
     if (IsCudaDebugGraph(Graph)) {
         fprintf(stderr, "[GraphCuVerify] GpuVerifyResult=0x%08x\n", (unsigned)Result);
+        if (SUCCEEDED(Result)) {
+            fprintf(stderr, "PH_CHM02_CUDA_VERIFY_OK\n");
+        }
     }
 
     return Result;
diff --git a/src/PerfectHash/TableCreateBestCsv.h b/src/PerfectHash/TableCreateBestCsv.h
index 5b610151..c023a3b9 100644
--- a/src/PerfectHash/TableCreateBestCsv.h
+++ b/src/PerfectHash/TableCreateBestCsv.h
@@ -280,6 +280,10 @@ Module Name:
           Context->GpuAddKeysSuccessButCpuAddKeysFailures,                                   \
           OUTPUT_INT)                                                                        \
                                                                                              \
+    ENTRY(GpuIsAcyclicButCpuIsCyclicFailures,                                                \
+          0,                                                                                 \
+          OUTPUT_INT)                                                                        \
+                                                                                             \
     ENTRY(GpuAndCpuAddKeysSuccess,                                                           \
           Context->GpuAndCpuAddKeysSuccess,                                                  \
           OUTPUT_INT)                                                                        \
diff --git a/src/PerfectHash/TableCreateCsv.h b/src/PerfectHash/TableCreateCsv.h
index a4f525b6..7611f087 100644
--- a/src/PerfectHash/TableCreateCsv.h
+++ b/src/PerfectHash/TableCreateCsv.h
@@ -279,6 +279,10 @@ Module Name:
           Context->GpuAddKeysSuccessButCpuAddKeysFailures,                                   \
           OUTPUT_INT)                                                                        \
                                                                                              \
+    ENTRY(GpuIsAcyclicButCpuIsCyclicFailures,                                                \
+          0,                                                                                 \
+          OUTPUT_INT)                                                                        \
+                                                                                             \
     ENTRY(GpuAndCpuAddKeysSuccess,                                                           \
           Context->GpuAndCpuAddKeysSuccess,                                                  \
           OUTPUT_INT)                                                                        \
diff --git a/src/PerfectHashCuda/Graph.cu b/src/PerfectHashCuda/Graph.cu
index d4ece595..8a3ffcb6 100644
--- a/src/PerfectHashCuda/Graph.cu
+++ b/src/PerfectHashCuda/Graph.cu
@@ -1010,7 +1010,7 @@ GraphCuAssignSerialKernel(
         const Edge3Type *Edge3 = &Edges3[Edge];
         VertexType Vertex1;
         VertexType Vertex2;
-        AssignedType Value;
+        uint32_t Value;
 
         if (!GraphCuIsVisitedVertex(Graph, Edge3->Vertex1)) {
             Vertex1 = Edge3->Vertex1;
@@ -1020,9 +1020,11 @@ GraphCuAssignSerialKernel(
             Vertex2 = Edge3->Vertex1;
         }
 
-        Value = (AssignedType)(Edge - Assigned[Vertex2]);
-        if (Value >= NumberOfEdges) {
-            Value = (AssignedType)(Value + NumberOfEdges);
+        Value = ((uint32_t)Edge +
+                 (uint32_t)NumberOfEdges -
+                 (uint32_t)Assigned[Vertex2]);
+        if (Value >= (uint32_t)NumberOfEdges) {
+            Value -= (uint32_t)NumberOfEdges;
         }
 
         //
@@ -1030,7 +1032,7 @@ GraphCuAssignSerialKernel(
         // this kernel runs.  Avoid trapping the device here; surface any
         // actual behavioral regressions through the normal verify path.
         //
-        Assigned[Vertex1] = Value;
+        Assigned[Vertex1] = (AssignedType)Value;
 
         GraphCuRegisterVertexVisit(Graph, Vertex1);
         GraphCuRegisterVertexVisit(Graph, Vertex2);
diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake
index 7486f564..db82f59e 100644
--- a/tests/run_cli_chm02_cuda_known_seed_test.cmake
+++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake
@@ -88,22 +88,22 @@ if(NOT failure_index EQUAL -1)
 endif()
 
 if(DEFINED REQUIRE_GPU_ASSIGN AND REQUIRE_GPU_ASSIGN)
-  string(FIND "${stderr}" "[GraphCuAssign] GpuAssignResult=0x00000000" gpu_assign_index)
+  string(FIND "${stderr}" "PH_CHM02_CUDA_ASSIGN_OK" gpu_assign_index)
   if(gpu_assign_index EQUAL -1)
-    message(FATAL_ERROR "Expected GPU assignment success log, but it was not present.")
+    message(FATAL_ERROR "Expected stable GPU assignment success token, but it was not present.")
   endif()
 endif()
 
 if(DEFINED REQUIRE_GPU_ORDER_VALID AND REQUIRE_GPU_ORDER_VALID)
-  string(FIND "${stderr}" "[GraphCuIsAcyclic] GpuOrderValidationResult=0x00000000" gpu_order_valid_index)
+  string(FIND "${stderr}" "PH_CHM02_CUDA_ORDER_OK" gpu_order_valid_index)
   if(gpu_order_valid_index EQUAL -1)
-    message(FATAL_ERROR "Expected GPU order validation success log, but it was not present.")
+    message(FATAL_ERROR "Expected stable GPU order-validation success token, but it was not present.")
   endif()
 endif()
 
 if(DEFINED REQUIRE_GPU_VERIFY AND REQUIRE_GPU_VERIFY)
-  string(FIND "${stderr}" "[GraphCuVerify] GpuVerifyResult=0x00000000" gpu_verify_index)
+  string(FIND "${stderr}" "PH_CHM02_CUDA_VERIFY_OK" gpu_verify_index)
   if(gpu_verify_index EQUAL -1)
-    message(FATAL_ERROR "Expected GPU verify success log, but it was not present.")
+    message(FATAL_ERROR "Expected stable GPU verify success token, but it was not present.")
   endif()
 endif()

From 302f13b03656d7f975fcfa572b528f0f3b599533 Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Mon, 30 Mar 2026 17:16:09 -0700
Subject: [PATCH 13/14] Chm02 CUDA: fix managed bitmap and test contract

---
 docs/chm02-cuda-mainline.md | 28 +++++++++++++++--
 src/PerfectHash/GraphCu.c   | 63 +++++++++++++++++++++++++++++++++++--
 2 files changed, 86 insertions(+), 5 deletions(-)

diff --git a/docs/chm02-cuda-mainline.md b/docs/chm02-cuda-mainline.md
index 507b2478..dc768522 100644
--- a/docs/chm02-cuda-mainline.md
+++ b/docs/chm02-cuda-mainline.md
@@ -20,6 +20,15 @@ The branch promotes the legacy `Chm02` CUDA path from a CPU-assisted bring-up
   - timing-field presence
 - Expose explicit per-phase CUDA timing fields for measurement.
 
+## Current Mainline Meaning
+
+For this branch, “mainline” means:
+
+- the major `Chm02` solve phases are GPU-backed
+- the path is correctness-first, not throughput-first
+- CPU graph state is still required as part of the current implementation for
+  bring-up compatibility and oracle-style validation support
+
 ## Non-Goals
 
 - High-throughput GPU solving.
@@ -33,7 +42,9 @@ The current `Chm02` CUDA implementation remains correctness-first, not
 ## Supported Scope
 
 - Algorithm: `Chm02`
-- Hash path: known-good seeded hash families already supported by the repo
+- Hash path: the branch is only accepted against the combinations covered by
+  the focused regression matrix below; broader hash-family support remains a
+  follow-on concern
 - CUDA path: single-graph bring-up / validation
 - Platform focus:
   - Linux with CUDA enabled
@@ -103,6 +114,9 @@ The following debug surface is intentionally supported for this bring-up phase:
 This surface is explicitly considered temporary bring-up instrumentation, not a
  long-term stable user-facing API.
 
+For this branch, however, the three `PH_CHM02_CUDA_*_OK` tokens are treated as
+ a supported test contract for the focused known-seed regression harness.
+
 ## Staged Task List
 
 1. Fix correctness blockers in the legacy CUDA `Chm02` path.
@@ -114,10 +128,18 @@ This surface is explicitly considered temporary bring-up instrumentation, not a
    - known-seed path
    - non-`Assigned16` generated path
    - timing-field presence
+7. Verify release-like behavior without relying on a silent CPU fallback:
+   - no-file-io path
+   - file-io path
+   - non-debug failure propagation remains via normal `HRESULT` / verify paths
 
 ## Acceptance
 
 - The focused CUDA `Chm02` regression tests pass when CUDA is enabled.
-- Known-seed `Chm02` CUDA runs succeed on Linux.
-- File-io and no-file-io paths both work in the covered scenarios.
+- Known-seed Linux coverage passes for:
+  - HologramWorld known-seed, no-file-io
+  - HologramWorld known-seed, file-io
+- Generated non-`Assigned16` coverage passes for:
+  - generated `33000`-key case
 - Timing fields are present and non-negative in CSV output.
+- CUDA-disabled builds continue to use the non-CUDA path.
diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c
index 9da08365..ea83efd7 100644
--- a/src/PerfectHash/GraphCu.c
+++ b/src/PerfectHash/GraphCu.c
@@ -295,6 +295,19 @@ Return Value:
     FREE_MANAGED_ARRAY(CuVertexLocks);
     FREE_MANAGED_ARRAY(CuEdgeLocks);
 
+#define FREE_MANAGED_BITMAP_BUFFER(Name)                                   \
+    if (Graph->Name.Buffer != NULL) {                                      \
+        CuResult = Cu->MemFree((CU_DEVICE_POINTER)Graph->Name.Buffer);     \
+        if (CU_FAILED(CuResult)) {                                         \
+            CU_ERROR(GraphCuRundown_MemFree_##Name##_ManagedBitmapBuffer,  \
+                     CuResult);                                             \
+        } else {                                                           \
+            Graph->Name.Buffer = NULL;                                     \
+        }                                                                  \
+    }
+
+    FREE_MANAGED_BITMAP_BUFFER(VisitedVerticesBitmap);
+
     //
     // Free applicable assigned arrays.
     //
@@ -704,10 +717,36 @@ Return Value:
     }
 
     ALLOC_HOST_BITMAP_BUFFER(DeletedEdgesBitmap);
-    ALLOC_HOST_BITMAP_BUFFER(VisitedVerticesBitmap);
     ALLOC_HOST_BITMAP_BUFFER(AssignedBitmap);
     ALLOC_HOST_BITMAP_BUFFER(IndexBitmap);
 
+#define ALLOC_MANAGED_BITMAP_BUFFER(Name)                                 \
+    if (Info->Name##BufferSizeInBytes > 0) {                              \
+        if (Graph->Name.Buffer != NULL) {                                 \
+            CuResult = Cu->MemFree((CU_DEVICE_POINTER)Graph->Name.Buffer); \
+            if (CU_FAILED(CuResult)) {                                    \
+                CU_ERROR(GraphCuLoadInfo_MemFree_##Name##_ManagedBitmap,   \
+                         CuResult);                                        \
+                Result = PH_E_CUDA_DRIVER_API_CALL_FAILED;                \
+                goto Error;                                               \
+            }                                                             \
+            Graph->Name.Buffer = NULL;                                    \
+        }                                                                 \
+        CuResult = Cu->MemAllocManaged(                                   \
+            (PCU_DEVICE_POINTER)&Graph->Name.Buffer,                      \
+            (SIZE_T)Info->Name##BufferSizeInBytes,                        \
+            CU_MEM_ATTACH_GLOBAL                                          \
+        );                                                                \
+        if (CU_FAILED(CuResult)) {                                        \
+            CU_ERROR(GraphCuLoadInfo_MemAllocManaged_##Name##_Bitmap,     \
+                     CuResult);                                            \
+            Result = PH_E_CUDA_DRIVER_API_CALL_FAILED;                    \
+            goto Error;                                                   \
+        }                                                                 \
+    }
+
+    ALLOC_MANAGED_BITMAP_BUFFER(VisitedVerticesBitmap);
+
     //
     // Check to see if we're in "first graph wins" mode, and have also been
     // asked to skip memory coverage information.  If so, we can jump straight
@@ -990,10 +1029,27 @@ Return Value:
     }
 
     ZERO_BITMAP_BUFFER(DeletedEdgesBitmap);
-    ZERO_BITMAP_BUFFER(VisitedVerticesBitmap);
     ZERO_BITMAP_BUFFER(AssignedBitmap);
     ZERO_BITMAP_BUFFER(IndexBitmap);
 
+#define ZERO_MANAGED_BITMAP_BUFFER(Name)                                 \
+    if (Info->Name##BufferSizeInBytes > 0) {                             \
+        CuResult = Cu->MemsetD8Async(                                    \
+            (PVOID)Graph->Name.Buffer,                                   \
+            0,                                                           \
+            Info->Name##BufferSizeInBytes,                               \
+            SolveContext->Stream                                         \
+        );                                                               \
+        if (CU_FAILED(CuResult)) {                                       \
+            CU_ERROR(GraphCuReset_MemsetD8Async_##Name##_Bitmap,         \
+                     CuResult);                                           \
+            Result = PH_E_CUDA_DRIVER_API_CALL_FAILED;                   \
+            goto Error;                                                  \
+        }                                                                \
+    }
+
+    ZERO_MANAGED_BITMAP_BUFFER(VisitedVerticesBitmap);
+
     //
     // "Empty" all of the nodes.
     //
@@ -1414,6 +1470,9 @@ Name(                                                            \
     for (Index = NumberOfKeys; Index > 0; Index--) {             \
         ULONG OrderIndex;                                        \
                                                                  \
+        Vertex1 = NULL;                                          \
+        Vertex2 = NULL;                                          \
+                                                                 \
         OrderIndex = Index - 1;                                  \
                                                                  \
         SignedEdge = (LONG)Order[OrderIndex];                    \

From 67f957a6741f7bb7d9c05410a6e14ffd18d31ad0 Mon Sep 17 00:00:00 2001
From: Trent Nelson <trent@trent.me>
Date: Wed, 8 Apr 2026 15:03:07 -0700
Subject: [PATCH 14/14] Chm02 CUDA: add non-debug regression coverage

---
 docs/chm02-cuda-mainline.md                   |  6 ++++
 src/PerfectHash/Graph.c                       | 30 ++++++++++++++++---
 tests/CMakeLists.txt                          | 12 ++++++++
 .../run_cli_chm02_cuda_known_seed_test.cmake  | 28 ++++++++++++-----
 4 files changed, 65 insertions(+), 11 deletions(-)

diff --git a/docs/chm02-cuda-mainline.md b/docs/chm02-cuda-mainline.md
index dc768522..13a3c801 100644
--- a/docs/chm02-cuda-mainline.md
+++ b/docs/chm02-cuda-mainline.md
@@ -117,6 +117,10 @@ This surface is explicitly considered temporary bring-up instrumentation, not a
 For this branch, however, the three `PH_CHM02_CUDA_*_OK` tokens are treated as
  a supported test contract for the focused known-seed regression harness.
 
+In addition to the debug-token path, this branch also requires one non-debug
+ known-seed regression to pass, in order to prove that the release-like path
+ succeeds without depending on `PH_DEBUG_CUDA_CHM02`.
+
 ## Staged Task List
 
 1. Fix correctness blockers in the legacy CUDA `Chm02` path.
@@ -126,6 +130,7 @@ For this branch, however, the three `PH_CHM02_CUDA_*_OK` tokens are treated as
 5. Expose explicit per-phase CUDA timing fields for measurement.
 6. Add focused CUDA regression coverage:
    - known-seed path
+   - non-debug known-seed path
    - non-`Assigned16` generated path
    - timing-field presence
 7. Verify release-like behavior without relying on a silent CPU fallback:
@@ -139,6 +144,7 @@ For this branch, however, the three `PH_CHM02_CUDA_*_OK` tokens are treated as
 - Known-seed Linux coverage passes for:
   - HologramWorld known-seed, no-file-io
   - HologramWorld known-seed, file-io
+  - HologramWorld known-seed, non-debug no-file-io
 - Generated non-`Assigned16` coverage passes for:
   - generated `33000`-key case
 - Timing fields are present and non-negative in CSV output.
diff --git a/src/PerfectHash/Graph.c b/src/PerfectHash/Graph.c
index 17d1adb7..1115a1c3 100644
--- a/src/PerfectHash/Graph.c
+++ b/src/PerfectHash/Graph.c
@@ -3869,8 +3869,19 @@ Return Value:
             Context->SpareGraph = NULL;
             Result = PH_S_USE_NEW_GRAPH_FOR_SOLVING;
         } else {
-            StopGraphSolving = TRUE;
-            Result = PH_S_STOP_GRAPH_SOLVING;
+            //
+            // The legacy invariant is that a spare graph exists whenever we
+            // want to keep solving after capturing the first solved graph.
+            // The current Chm02 CUDA correctness path can legitimately run
+            // without a spare graph under the covered single-graph settings,
+            // in which case we preserve the solved graph by stopping.
+            //
+            if (IsCuGraph(Graph)) {
+                StopGraphSolving = TRUE;
+                Result = PH_S_STOP_GRAPH_SOLVING;
+            } else {
+                ASSERT(SpareGraph != NULL);
+            }
         }
     }
 
@@ -7018,8 +7029,19 @@ Return Value:
             Context->SpareGraph = NULL;
             Result = PH_S_USE_NEW_GRAPH_FOR_SOLVING;
         } else {
-            StopGraphSolving = TRUE;
-            Result = PH_S_STOP_GRAPH_SOLVING;
+            //
+            // The legacy invariant is that a spare graph exists whenever we
+            // want to keep solving after capturing the first solved graph.
+            // The current Chm02 CUDA correctness path can legitimately run
+            // without a spare graph under the covered single-graph settings,
+            // in which case we preserve the solved graph by stopping.
+            //
+            if (IsCuGraph(Graph)) {
+                StopGraphSolving = TRUE;
+                Result = PH_S_STOP_GRAPH_SOLVING;
+            } else {
+                ASSERT(SpareGraph != NULL);
+            }
         }
     }
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index f4a836f4..a2c12fe9 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -292,6 +292,17 @@ if(TARGET PerfectHashCreateExe AND TARGET PerfectHashBulkCreateExe)
                 -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_known_seed_test.cmake
         )
 
+        add_test(
+            NAME perfecthash.cuda.chm02.hologram.nondebug
+            COMMAND ${CMAKE_COMMAND}
+                -DTEST_EXE=$<TARGET_FILE:PerfectHashCreateExe>
+                -DTEST_KEYS=${TEST_KEYS_FILE}
+                -DTEST_OUTPUT=${TEST_OUTPUT_DIR}/cuda-chm02-hologram-nondebug
+                "-DTEST_FLAGS=--NoFileIo"
+                -DTEST_ENABLE_DEBUG=0
+                -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_known_seed_test.cmake
+        )
+
         add_test(
             NAME perfecthash.cuda.chm02.generated33000.nofileio
             COMMAND ${CMAKE_COMMAND}
@@ -314,6 +325,7 @@ if(TARGET PerfectHashCreateExe AND TARGET PerfectHashBulkCreateExe)
         set_tests_properties(
             perfecthash.cuda.chm02.hologram.nofileio
             perfecthash.cuda.chm02.hologram.fileio
+            perfecthash.cuda.chm02.hologram.nondebug
             perfecthash.cuda.chm02.generated33000.nofileio
             perfecthash.cuda.chm02.perf-surface
             PROPERTIES
diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake
index db82f59e..161c3300 100644
--- a/tests/run_cli_chm02_cuda_known_seed_test.cmake
+++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake
@@ -8,6 +8,11 @@ if(NOT DEFINED TEST_OUTPUT)
   message(FATAL_ERROR "TEST_OUTPUT is required")
 endif()
 
+set(test_enable_debug TRUE)
+if(DEFINED TEST_ENABLE_DEBUG)
+  set(test_enable_debug "${TEST_ENABLE_DEBUG}")
+endif()
+
 file(TO_NATIVE_PATH "${TEST_EXE}" test_exe_native)
 file(TO_NATIVE_PATH "${TEST_KEYS}" test_keys_native)
 file(TO_NATIVE_PATH "${TEST_OUTPUT}" test_output_native)
@@ -67,13 +72,22 @@ else()
   list(APPEND args "--NoFileIo" "--DisableCsvOutputFile")
 endif()
 
-execute_process(
-  COMMAND ${CMAKE_COMMAND} -E env PH_DEBUG_CUDA_CHM02=1
-          "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args}
-  RESULT_VARIABLE result
-  OUTPUT_VARIABLE stdout
-  ERROR_VARIABLE stderr
-)
+if(test_enable_debug)
+  execute_process(
+    COMMAND ${CMAKE_COMMAND} -E env PH_DEBUG_CUDA_CHM02=1
+            "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args}
+    RESULT_VARIABLE result
+    OUTPUT_VARIABLE stdout
+    ERROR_VARIABLE stderr
+  )
+else()
+  execute_process(
+    COMMAND "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args}
+    RESULT_VARIABLE result
+    OUTPUT_VARIABLE stdout
+    ERROR_VARIABLE stderr
+  )
+endif()
 
 message(STATUS "stdout: ${stdout}")
 message(STATUS "stderr: ${stderr}")