From 7c06b9c3efce7297d29ab41ee6bc16a3c332ff55 Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Sun, 29 Mar 2026 21:00:33 -0700 Subject: [PATCH 01/14] Chm02 CUDA: advance single-graph bring-up --- src/PerfectHash/Graph.c | 62 ++++++++++ src/PerfectHash/GraphCu.c | 171 +++++++++++++++++++++------ src/PerfectHashCuda/Graph.cu | 221 +++++++++++++++++------------------ 3 files changed, 300 insertions(+), 154 deletions(-) diff --git a/src/PerfectHash/Graph.c b/src/PerfectHash/Graph.c index bc2deaba..9bb8d30b 100644 --- a/src/PerfectHash/Graph.c +++ b/src/PerfectHash/Graph.c @@ -544,6 +544,7 @@ Return Value: PPERFECT_HASH_CONTEXT Context; PASSIGNED_MEMORY_COVERAGE Coverage; PASSIGNED16_MEMORY_COVERAGE Coverage16; + BOOLEAN DebugCudaSolve; // // Initialize aliases. @@ -554,6 +555,13 @@ Return Value: Table = Context->Table; NumberOfKeys = Table->Keys->NumberOfKeys.LowPart; Edges = Keys = (PKEY)Table->Keys->KeyArrayBaseAddress; + DebugCudaSolve = FALSE; + +#ifdef PH_WINDOWS + DebugCudaSolve = (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0); +#else + DebugCudaSolve = (getenv("PH_DEBUG_CUDA_CHM02") != NULL); +#endif // // Attempt to add all the keys to the graph. @@ -561,6 +569,12 @@ Return Value: Result = Graph->Vtbl->AddKeys(Graph, NumberOfKeys, Keys); + if (DebugCudaSolve && IsCuGraph(Graph)) { + fprintf(stderr, + "[GraphSolve] AddKeys result=0x%08x\n", + (unsigned)Result); + } + if (FAILED(Result)) { // @@ -594,6 +608,14 @@ Return Value: // Result = Graph->Vtbl->IsAcyclic(Graph); + if (DebugCudaSolve && IsCuGraph(Graph)) { + fprintf(stderr, + "[GraphSolve] IsAcyclic result=0x%08x DeletedEdgeCount=%u " + "OrderIndex=%ld\n", + (unsigned)Result, + (unsigned)Graph->DeletedEdgeCount, + (long)Graph->OrderIndex); + } if (FAILED(Result)) { // @@ -643,6 +665,12 @@ Return Value: Result = Graph->Vtbl->Assign(Graph); + if (DebugCudaSolve && IsCuGraph(Graph)) { + fprintf(stderr, + "[GraphSolve] Assign result=0x%08x\n", + (unsigned)Result); + } + // // Assign() should always succeed. // @@ -3974,6 +4002,7 @@ Return Value: { PGRAPH NewGraph; HRESULT Result = S_OK; + BOOLEAN DebugCudaSolve; // // Validate arguments. @@ -3983,6 +4012,13 @@ Return Value: return E_POINTER; } + DebugCudaSolve = FALSE; +#ifdef PH_WINDOWS + DebugCudaSolve = (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0); +#else + DebugCudaSolve = (getenv("PH_DEBUG_CUDA_CHM02") != NULL); +#endif + // // Acquire the exclusive graph lock for the duration of the routine. The // graph should never be locked at this point; if it is, consider it a @@ -4001,6 +4037,12 @@ Return Value: Result = Graph->Vtbl->LoadInfo(Graph); + if (DebugCudaSolve && IsCuGraph(Graph)) { + fprintf(stderr, + "[GraphEnterSolvingLoop] LoadInfo result=0x%08x\n", + (unsigned)Result); + } + if (FAILED(Result)) { if (Result != E_OUTOFMEMORY) { @@ -4040,6 +4082,11 @@ Return Value: while (GraphShouldWeContinueTryingToSolve(Graph)) { Result = Graph->Vtbl->Reset(Graph); + if (DebugCudaSolve && IsCuGraph(Graph)) { + fprintf(stderr, + "[GraphEnterSolvingLoop] Reset result=0x%08x\n", + (unsigned)Result); + } if (FAILED(Result)) { PH_ERROR(GraphReset, Result); break; @@ -4048,6 +4095,16 @@ Return Value: } Result = Graph->Vtbl->LoadNewSeeds(Graph); + if (DebugCudaSolve && IsCuGraph(Graph)) { + fprintf(stderr, + "[GraphEnterSolvingLoop] LoadNewSeeds result=0x%08x " + "Seed1=%u Seed2=%u Seed3=%u Seed4=%u\n", + (unsigned)Result, + (unsigned)Graph->Seed1, + (unsigned)Graph->Seed2, + (unsigned)Graph->Seed3, + (unsigned)Graph->Seed4); + } if (FAILED(Result)) { // @@ -4061,6 +4118,11 @@ Return Value: NewGraph = NULL; Result = Graph->Vtbl->Solve(Graph, &NewGraph); + if (DebugCudaSolve && IsCuGraph(Graph)) { + fprintf(stderr, + "[GraphEnterSolvingLoop] Solve result=0x%08x\n", + (unsigned)Result); + } if (FAILED(Result)) { PH_ERROR(GraphSolve, Result); break; diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c index bcf041e8..4ec65ab2 100644 --- a/src/PerfectHash/GraphCu.c +++ b/src/PerfectHash/GraphCu.c @@ -1317,6 +1317,19 @@ GraphCuLoadNewSeeds( return Result; } +FORCEINLINE +BOOLEAN +IsCudaDebugEnabled( + VOID + ) +{ +#ifdef PH_WINDOWS + return (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0); +#else + return (getenv("PH_DEBUG_CUDA_CHM02") != NULL); +#endif +} + HRESULT GraphCuAddKeys( _In_ PGRAPH Graph, @@ -1325,6 +1338,7 @@ GraphCuAddKeys( ) { PCU Cu; + HRESULT Result; // // Keys have already been prepared on the GPU, so we don't need to use @@ -1336,10 +1350,22 @@ GraphCuAddKeys( Cu = Graph->CuSolveContext->DeviceContext->Cu; - return Cu->AddKeys(Graph, - Graph->CuBlocksPerGrid, - Graph->CuThreadsPerBlock, - Graph->CuSharedMemory); + Result = Cu->AddKeys(Graph, + Graph->CuBlocksPerGrid, + Graph->CuThreadsPerBlock, + Graph->CuSharedMemory); + + if (IsCudaDebugEnabled()) { + fprintf(stderr, + "[GraphCuAddKeys] Result=0x%08x HashKeysResult=0x%08x " + "VertexFailures=%u WarpFailures=%u\n", + (unsigned)Result, + (unsigned)Graph->CuHashKeysResult, + (unsigned)Graph->CuVertexCollisionFailures, + (unsigned)Graph->CuWarpVertexCollisionFailures); + } + + return Result; } HRESULT @@ -1372,6 +1398,16 @@ GraphCuIsAcyclic( Graph->CuThreadsPerBlock, Graph->CuSharedMemory); + if (IsCudaDebugEnabled()) { + fprintf(stderr, + "[GraphCuIsAcyclic] GpuResult=0x%08x Attempts=%u " + "DeletedEdges=%u OrderIndex=%ld\n", + (unsigned)Result, + (unsigned)Graph->CuIsAcyclicPhase1Attempts, + (unsigned)Graph->DeletedEdgeCount, + (long)Graph->OrderIndex); + } + // // If we weren't acyclic, return. // @@ -1438,57 +1474,107 @@ GraphCuIsAcyclic( ASSERT(Graph->CpuGraph != NULL); ASSERT(Graph->Impl == 3); - if (IsUsingAssigned16(Graph)) { - Result = Graph->CpuGraph->Vtbl->AddKeys(Graph->CpuGraph, - NumberOfKeys, - Keys); - if (FAILED(Result)) { - InterlockedIncrement64( - &Context->GpuAddKeysSuccessButCpuAddKeysFailures); - return Result; - } else { - InterlockedIncrement64(&Context->GpuAndCpuAddKeysSuccess); + Result = Graph->CpuGraph->Vtbl->AddKeys(Graph->CpuGraph, + NumberOfKeys, + Keys); + if (FAILED(Result)) { + InterlockedIncrement64( + &Context->GpuAddKeysSuccessButCpuAddKeysFailures); + return Result; + } else { + InterlockedIncrement64(&Context->GpuAndCpuAddKeysSuccess); + } + + if (IsCudaDebugEnabled()) { + HRESULT CpuAcyclicResult; + ULONG MismatchIndex; + ULONG CpuOrder; + ULONG GpuOrder; + + CpuAcyclicResult = Graph->CpuGraph->Vtbl->IsAcyclic(Graph->CpuGraph); + fprintf(stderr, + "[GraphCuIsAcyclic] CpuAcyclicOracleResult=0x%08x\n", + (unsigned)CpuAcyclicResult); + + if (SUCCEEDED(CpuAcyclicResult)) { + MismatchIndex = (ULONG)-1; + + if (IsUsingAssigned16(Graph)) { + for (ULONG Index = 0; Index < NumberOfKeys; Index++) { + CpuOrder = ((PUSHORT)Graph->CpuGraph->Order16)[Index]; + GpuOrder = ((PUSHORT)Graph->Order16)[Index]; + if (CpuOrder != GpuOrder) { + MismatchIndex = Index; + fprintf(stderr, + "[GraphCuIsAcyclic] OrderMismatch16 index=%u " + "gpu=%u cpu=%u\n", + (unsigned)Index, + (unsigned)GpuOrder, + (unsigned)CpuOrder); + break; + } + } + } else { + for (ULONG Index = 0; Index < NumberOfKeys; Index++) { + CpuOrder = ((PULONG)Graph->CpuGraph->Order)[Index]; + GpuOrder = ((PULONG)Graph->Order)[Index]; + if (CpuOrder != GpuOrder) { + MismatchIndex = Index; + fprintf(stderr, + "[GraphCuIsAcyclic] OrderMismatch index=%u " + "gpu=%u cpu=%u\n", + (unsigned)Index, + (unsigned)GpuOrder, + (unsigned)CpuOrder); + break; + } + } + } + + if (MismatchIndex == (ULONG)-1) { + fprintf(stderr, + "[GraphCuIsAcyclic] Order arrays match CPU oracle.\n"); + } } - Result = Graph->CpuGraph->Vtbl->IsAcyclic(Graph->CpuGraph); + Result = Graph->CpuGraph->Vtbl->Reset(Graph->CpuGraph); if (FAILED(Result)) { - InterlockedIncrement64( - &Context->GpuIsAcyclicButCpuIsCyclicFailures); return Result; - } else { - InterlockedIncrement64(&Context->GpuAndCpuIsAcyclicSuccess); } - } else { + Result = Graph->CpuGraph->Vtbl->AddKeys(Graph->CpuGraph, NumberOfKeys, Keys); if (FAILED(Result)) { - InterlockedIncrement64( - &Context->GpuAddKeysSuccessButCpuAddKeysFailures); return Result; - } else { - InterlockedIncrement64(&Context->GpuAndCpuAddKeysSuccess); - } - - Result = Graph->CpuGraph->Vtbl->IsAcyclic(Graph->CpuGraph); - if (FAILED(Result)) { - InterlockedIncrement64( - &Context->GpuIsAcyclicButCpuIsCyclicFailures); - return Result; - } else { - InterlockedIncrement64(&Context->GpuAndCpuIsAcyclicSuccess); } } // - // Copy the Order[] array from the CPU graph. + // The GPU path now owns the peel order. Feed the captured Order[] into the + // CPU graph so Assign()/Verify() can act as an oracle during bring-up + // without recomputing IsAcyclic() on the CPU. // - ASSERT(SUCCEEDED(Result)); - CopyMemory(Graph->Order, - Graph->CpuGraph->Order, + CopyMemory(Graph->CpuGraph->Order, + Graph->Order, Info->OrderSizeInBytes); + Graph->Flags.IsAcyclic = TRUE; + Graph->DeletedEdgeCount = NumberOfKeys; + Graph->OrderIndex = 0; + + Graph->CpuGraph->Flags.IsAcyclic = TRUE; + Graph->CpuGraph->DeletedEdgeCount = NumberOfKeys; + + if (IsUsingAssigned16(Graph)) { + Graph->CpuGraph->Order16Index = 0; + } else { + Graph->CpuGraph->OrderIndex = 0; + } + + InterlockedIncrement64(&Context->GpuAndCpuIsAcyclicSuccess); + return Result; } @@ -1500,7 +1586,18 @@ GraphCuAssign( PRTL Rtl; HRESULT Result; + if (IsCudaDebugEnabled()) { + fprintf(stderr, + "[GraphCuAssign] Enter IsAcyclic=%u OrderIndex=%ld CpuOrderIndex=%ld\n", + (unsigned)Graph->Flags.IsAcyclic, + (long)Graph->OrderIndex, + (long)Graph->CpuGraph->OrderIndex); + } + Result = Graph->CpuGraph->Vtbl->Assign(Graph->CpuGraph); + if (IsCudaDebugEnabled()) { + fprintf(stderr, "[GraphCuAssign] CpuAssignResult=0x%08x\n", (unsigned)Result); + } if (FAILED(Result)) { return Result; } diff --git a/src/PerfectHashCuda/Graph.cu b/src/PerfectHashCuda/Graph.cu index 62709ff3..9dfbe758 100644 --- a/src/PerfectHashCuda/Graph.cu +++ b/src/PerfectHashCuda/Graph.cu @@ -205,7 +205,6 @@ GraphCuAddEdge1( PrevVertex3 = *Vertex; NextDegree = PrevVertex3.Degree + 1; - NextDegree += 1; NextEdges = PrevVertex3.Edges; NextEdges ^= Edge; @@ -808,9 +807,8 @@ GraphCuRemoveVertex( //DegreeType Degree; Edge3Type Edge3; Edge3Type *Edges3 = (decltype(Edges3))Graph->Edges3; - //OrderType *Order = (decltype(Order))Graph->Order; - //OrderType *OrderAddress; - //OrderIndexType OrderIndex; + OrderType *Order = (decltype(Order))Graph->Order; + OrderIndexType OrderIndex; OrderIndexType *GraphOrderIndex = (decltype(GraphOrderIndex))&Graph->OrderIndex; @@ -848,19 +846,11 @@ GraphCuRemoveVertex( if (Removed1 || Removed2) { AtomicAggIncCGV(&Graph->DeletedEdgeCount); - AtomicAggSubCGV(GraphOrderIndex); -#if 0 - OrderIndex = AtomicAggSubCG(GraphOrderIndex); - OrderAddress = &Order[OrderIndex]; + OrderIndex = atomicSub(GraphOrderIndex, 1) - 1; if (OrderIndex >= 0) { - ASSERT(*OrderAddress == 0); - ASSERT(Order[OrderIndex] == 0); - Order[OrderIndex] = Edge; - ASSERT(Order[OrderIndex] == Edge); - ASSERT(*OrderAddress == Edge); + Order[OrderIndex] = (OrderType)Edge; } -#endif } End: @@ -897,6 +887,75 @@ GraphCuIsAcyclicPhase1Kernel( return; } +template +GLOBAL +VOID +GraphCuIsAcyclicSerialKernel( + GraphType* Graph + ) +{ + using EdgeType = typename GraphType::EdgeType; + using Edge3Type = typename GraphType::Edge3Type; + using OrderType = typename GraphType::OrderType; + using VertexType = typename GraphType::VertexType; + using Vertex3Type = typename GraphType::Vertex3Type; + + if (blockIdx.x != 0 || threadIdx.x != 0) { + return; + } + + auto *Vertices3 = (Vertex3Type *)Graph->Vertices3; + auto *Edges3 = (Edge3Type *)Graph->Edges3; + auto *Order = (OrderType *)Graph->Order; + + Graph->Flags.Shrinking = TRUE; + + while (TRUE) { + bool Progress = false; + + for (uint32_t Index = 0; Index < Graph->NumberOfVertices; Index++) { + VertexType VertexIndex = (VertexType)Index; + Vertex3Type *Vertex = &Vertices3[VertexIndex]; + + if (Vertex->Degree != 1) { + continue; + } + + EdgeType Edge = Vertex->Edges; + Edge3Type *Edge3 = &Edges3[Edge]; + VertexType Vertex1 = Edge3->Vertex1; + VertexType Vertex2 = Edge3->Vertex2; + + if (Vertices3[VertexIndex].Degree != 1) { + continue; + } + + if (Vertices3[Vertex1].Degree > 0) { + --Vertices3[Vertex1].Degree; + Vertices3[Vertex1].Edges ^= Edge; + } + + if (Vertices3[Vertex2].Degree > 0) { + --Vertices3[Vertex2].Degree; + Vertices3[Vertex2].Edges ^= Edge; + } + + ++Graph->DeletedEdgeCount; + Graph->OrderIndex -= 1; + Order[Graph->OrderIndex] = (OrderType)Edge; + Progress = true; + } + + if (!Progress) { + break; + } + } + + if (Graph->DeletedEdgeCount == Graph->NumberOfKeys) { + Graph->Flags.IsAcyclic = TRUE; + } +} + EXTERN_C HOST HRESULT @@ -908,10 +967,8 @@ GraphCuIsAcyclic( ) { BOOLEAN IsAcyclic = FALSE; - ULONG Attempts = 0; - LONG OrderIndexDelta = 0; - LONG PreviousOrderIndex = 0; LONG DeviceOrderIndex = 0; + ULONG DeviceDeletedEdges = 0; HRESULT Result; PGRAPH DeviceGraph; @@ -919,28 +976,6 @@ GraphCuIsAcyclic( PPH_CU_SOLVE_CONTEXT SolveContext; ULONG SharedMemory = SharedMemoryInBytes; - // - // Get suitable launch parameters for the IsAcyclicPhase1() kernel. - // - - if (IsUsingAssigned16(Graph)) { - Result = GetKernelConfig(Graph, - (PVOID)GraphCuIsAcyclicPhase1Kernel, - BlocksPerGrid, - ThreadsPerBlock, - SharedMemory); - } else { - Result = GetKernelConfig(Graph, - (PVOID)GraphCuIsAcyclicPhase1Kernel, - BlocksPerGrid, - ThreadsPerBlock, - SharedMemory); - } - - if (FAILED(Result)) { - goto End; - } - // // Initialize aliases. // @@ -949,92 +984,44 @@ GraphCuIsAcyclic( DeviceGraph = SolveContext->DeviceGraph; Stream = (CUstream_st *)SolveContext->Stream; - // - // Enter the kernel launch loop. - // - - while (TRUE) { - - ++Attempts; - - // - // Dispatch the appropriate kernel and wait for completion. - // - - if (IsUsingAssigned16(Graph)) { - GraphCuIsAcyclicPhase1Kernel<<>>((PGRAPH16)DeviceGraph); - } else { - GraphCuIsAcyclicPhase1Kernel<<>>((PGRAPH32)DeviceGraph); - } - - CUDA_CALL(cudaMemcpyAsync((PVOID)&DeviceOrderIndex, - (PVOID)&DeviceGraph->OrderIndex, - sizeof(DeviceOrderIndex), - cudaMemcpyDeviceToHost, - Stream)); - - CUDA_CALL(cudaStreamSynchronize(Stream)); - - // - // Check to see if our OrderIndex has reached 0, this is indicative - // of an acyclic graph. (We use <= 0 because we may see -1 or 0 in - // some cases.) - // + BlocksPerGrid = 1; + ThreadsPerBlock = 1; + SharedMemory = 0; - if (DeviceOrderIndex <= 0) { - - // - // We were able to delete all vertices with degree 1, therefore, - // our graph is acyclic. - // - - IsAcyclic = TRUE; - break; - } - - // - // If this is our first pass, capture the OrderIndex as previous and - // continue. - // + if (IsUsingAssigned16(Graph)) { + GraphCuIsAcyclicSerialKernel<<>>((PGRAPH16)DeviceGraph); + } else { + GraphCuIsAcyclicSerialKernel<<>>((PGRAPH32)DeviceGraph); + } - if (Attempts == 1) { - PreviousOrderIndex = DeviceOrderIndex; - continue; - } + CUDA_CALL(cudaMemcpyAsync((PVOID)&DeviceOrderIndex, + (PVOID)&DeviceGraph->OrderIndex, + sizeof(DeviceOrderIndex), + cudaMemcpyDeviceToHost, + Stream)); - // - // Calculate the delta between the current OrderIndex and what we saw - // on the last pass. If they haven't changed, it means we weren't able - // to find any more vertices with degree 1 to delete, which means the - // graph isn't acyclic. - // + CUDA_CALL(cudaMemcpyAsync((PVOID)&DeviceDeletedEdges, + (PVOID)&DeviceGraph->DeletedEdgeCount, + sizeof(DeviceDeletedEdges), + cudaMemcpyDeviceToHost, + Stream)); - OrderIndexDelta = PreviousOrderIndex - DeviceOrderIndex; - ASSERT(OrderIndexDelta >= 0); - if (OrderIndexDelta == 0) { - break; - } + CUDA_CALL(cudaStreamSynchronize(Stream)); - // - // Update previous value and continue for another pass. - // + Graph->CuIsAcyclicPhase1Attempts = 1; + Graph->DeletedEdgeCount = DeviceDeletedEdges; + Graph->OrderIndex = DeviceOrderIndex; - PreviousOrderIndex = DeviceOrderIndex; + if (DeviceOrderIndex <= 0) { + IsAcyclic = TRUE; } - // - // Capture how many attempts were made to determine if the graph was - // acyclic. - // - - Graph->CuIsAcyclicPhase1Attempts = Attempts; - // // Make a note that we're acyclic if applicable in the graph's flags. // This is checked by GraphAssign() to ensure we only operate on acyclic From e5f269670b6e4cbbd2224ed11639db539d70e03c Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Sun, 29 Mar 2026 21:01:10 -0700 Subject: [PATCH 02/14] Chm02 CUDA: fix no-file-io completion path --- src/PerfectHash/Chm02.c | 98 ++++++++++++++++++- src/PerfectHash/Graph.c | 65 ++++++++++-- .../run_cli_chm02_cuda_known_seed_test.cmake | 46 +++++++++ 3 files changed, 198 insertions(+), 11 deletions(-) create mode 100644 tests/run_cli_chm02_cuda_known_seed_test.cmake diff --git a/src/PerfectHash/Chm02.c b/src/PerfectHash/Chm02.c index 8525a58b..0fda6299 100644 --- a/src/PerfectHash/Chm02.c +++ b/src/PerfectHash/Chm02.c @@ -17,6 +17,19 @@ Module Name: #include "Chm01.h" #include "Chm02Private.h" +FORCEINLINE +BOOLEAN +IsChm02CudaDebugEnabled( + VOID + ) +{ +#ifdef PH_WINDOWS + return (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0); +#else + return (getenv("PH_DEBUG_CUDA_CHM02") != NULL); +#endif +} + // // Main table creation implementation routine for Chm02. // @@ -129,6 +142,7 @@ Return Value: PPERFECT_HASH_CONTEXT Context; BOOL WaitForAllEvents = TRUE; PERFECT_HASH_TABLE_CREATE_FLAGS TableCreateFlags; + BOOLEAN DebugCuda; PPH_CU_DEVICE_CONTEXT DeviceContext; PPH_CU_DEVICE_CONTEXTS DeviceContexts; PPH_CU_SOLVE_CONTEXTS SolveContexts; @@ -182,6 +196,7 @@ Return Value: TableCreateFlags.AsULongLong = Table->TableCreateFlags.AsULongLong; Silent = (TableCreateFlags.Silent != FALSE); + DebugCuda = IsChm02CudaDebugEnabled(); // // Initialize variables used if we jump to Error early-on. @@ -517,6 +532,14 @@ Return Value: WaitForThreadpoolWorkCallbacks(Context->MainWork, TRUE); WaitForThreadpoolWorkCallbacks(Context->FinishedWork, FALSE); + if (DebugCuda) { + fprintf(stderr, + "[Chm02] Threadpool callbacks drained. FinishedCount=%lld " + "FirstSolvedWins=%u\n", + (long long)Context->FinishedCount, + (unsigned)FirstSolvedGraphWins(Context)); + } + Success = (Context->FinishedCount > 0); if (!Success && !CtrlCPressed) { @@ -634,6 +657,14 @@ Return Value: ASSERT(ListEntry); Graph = CONTAINING_RECORD(ListEntry, GRAPH, ListEntry); + if (DebugCuda) { + fprintf(stderr, + "[Chm02] Pulled winning graph from finished list: %p " + "IsCuGraph=%u\n", + Graph, + (unsigned)IsCuGraph(Graph)); + } + } else { EnterCriticalSection(&Context->BestGraphCriticalSection); @@ -644,6 +675,14 @@ Return Value: if (!Graph) { goto Error; } + + if (DebugCuda) { + fprintf(stderr, + "[Chm02] Selected best graph from context: %p " + "IsCuGraph=%u\n", + Graph, + (unsigned)IsCuGraph(Graph)); + } } // @@ -677,6 +716,15 @@ Return Value: Context->SolvedContext = Graph; + if (DebugCuda) { + fprintf(stderr, + "[Chm02] Context->SolvedContext set. MaxDepth=%u " + "EmptyVertices=%u Collisions=%u\n", + (unsigned)Graph->MaximumTraversalDepth, + (unsigned)Graph->NumberOfEmptyVertices, + (unsigned)Graph->Collisions); + } + // // Graphs always pass verification in normal circumstances. The only time // they don't is if there's an internal bug in our code. So, knowing that @@ -695,6 +743,15 @@ Return Value: } else { + if (DebugCuda) { + fprintf(stderr, + "[Chm02] NoFileIo path entered. IsCuGraph=%u " + "AssignedElementSize=%u NumberOfTableElements=%llu\n", + (unsigned)IsCuGraph(Graph), + (unsigned)TableInfoOnDisk->AssignedElementSizeInBytes, + (unsigned long long)TableInfoOnDisk->NumberOfTableElements.QuadPart); + } + PGRAPH_INFO_ON_DISK NewGraphInfoOnDisk; // @@ -727,6 +784,10 @@ Return Value: GraphInfoOnDisk, sizeof(*NewGraphInfoOnDisk)); + if (DebugCuda) { + fprintf(stderr, "[Chm02] Copied GraphInfoOnDisk.\n"); + } + // // Sanity check the first seed is not 0. // @@ -745,6 +806,13 @@ Return Value: &Graph->FirstSeed, Graph->NumberOfSeeds * sizeof(Graph->FirstSeed)); + if (DebugCuda) { + fprintf(stderr, + "[Chm02] Copied seeds. FirstSeed=%u NumberOfSeeds=%u\n", + (unsigned)Graph->FirstSeed, + (unsigned)Graph->NumberOfSeeds); + } + // // Switch the pointers. // @@ -783,7 +851,7 @@ Return Value: SizeInBytes = ( TableInfoOnDisk->NumberOfTableElements.QuadPart * - TableInfoOnDisk->KeySizeInBytes + TableInfoOnDisk->AssignedElementSizeInBytes ); TryLargePageVirtualAlloc = Rtl->Vtbl->TryLargePageVirtualAlloc; @@ -795,6 +863,15 @@ Return Value: &LargePagesForTableData); Table->TableDataBaseAddress = BaseAddress; + Table->TableDataSizeInBytes = SizeInBytes; + + if (DebugCuda) { + fprintf(stderr, + "[Chm02] Allocated table data SizeInBytes=%lld " + "BaseAddress=%p\n", + (long long)SizeInBytes, + BaseAddress); + } if (!BaseAddress) { Result = E_OUTOFMEMORY; @@ -821,6 +898,10 @@ Return Value: Graph->Assigned, SizeInBytes); + if (DebugCuda) { + fprintf(stderr, "[Chm02] Copied table data from Graph->Assigned.\n"); + } + } } @@ -832,10 +913,18 @@ Return Value: CONTEXT_START_TIMERS(Verify); + if (DebugCuda) { + fprintf(stderr, "[Chm02] About to call Graph->Verify().\n"); + } + Result = Graph->Vtbl->Verify(Graph); CONTEXT_END_TIMERS(Verify); + if (DebugCuda) { + fprintf(stderr, "[Chm02] Verify result=0x%08x\n", (unsigned)Result); + } + // // Set the verified table event (regardless of whether or not we succeeded // in verification). The save file work will be waiting upon it in order to @@ -848,12 +937,19 @@ Return Value: goto Error; } + if (DebugCuda) { + fprintf(stderr, "[Chm02] VerifiedTableEvent signaled.\n"); + } + if (FAILED(Result)) { Result = PH_E_TABLE_VERIFICATION_FAILED; goto Error; } if (NoFileIo(Table)) { + if (DebugCuda) { + fprintf(stderr, "[Chm02] NoFileIo exit after verify.\n"); + } goto End; } diff --git a/src/PerfectHash/Graph.c b/src/PerfectHash/Graph.c index 9bb8d30b..bd35725b 100644 --- a/src/PerfectHash/Graph.c +++ b/src/PerfectHash/Graph.c @@ -3864,15 +3864,20 @@ Return Value: ASSERT(Context->NewBestGraphCount == 0); ASSERT(Context->FirstAttemptSolved == 0); SpareGraph = Context->SpareGraph; - ASSERT(SpareGraph != NULL); - ASSERT(IsSpareGraph(SpareGraph)); - SpareGraph->Flags.IsSpare = FALSE; - Context->SpareGraph = NULL; Context->BestGraph = Graph; FoundBestGraph = TRUE; Coverage->BestGraphNumber = ++Context->NewBestGraphCount; Context->FirstAttemptSolved = Graph->Attempt; - Result = PH_S_USE_NEW_GRAPH_FOR_SOLVING; + + if (SpareGraph != NULL) { + ASSERT(IsSpareGraph(SpareGraph)); + SpareGraph->Flags.IsSpare = FALSE; + Context->SpareGraph = NULL; + Result = PH_S_USE_NEW_GRAPH_FOR_SOLVING; + } else { + StopGraphSolving = TRUE; + Result = PH_S_STOP_GRAPH_SOLVING; + } } // @@ -6959,6 +6964,7 @@ Return Value: PPERFECT_HASH_CONTEXT Context; PASSIGNED16_MEMORY_COVERAGE Coverage; PERFECT_HASH_TABLE_BEST_COVERAGE_TYPE_ID CoverageType; + BOOLEAN DebugCudaSolve; // // Initialize aliases. @@ -6969,6 +6975,22 @@ Return Value: CoverageType = Context->BestCoverageType; Attempt = Coverage->Attempt; ElapsedMilliseconds = GetTickCount64() - Context->StartMilliseconds; + DebugCudaSolve = FALSE; + +#ifdef PH_WINDOWS + DebugCudaSolve = (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0); +#else + DebugCudaSolve = (getenv("PH_DEBUG_CUDA_CHM02") != NULL); +#endif + + if (DebugCudaSolve && IsCuGraph(Graph)) { + fprintf(stderr, + "[GraphRegisterSolved16NoBestCoverage] Enter " + "BestGraph=%p SpareGraph=%p SolutionNumber=%llu\n", + Context->BestGraph, + Context->SpareGraph, + (unsigned long long)Graph->SolutionNumber); + } // // Indicate continue graph solving by default. @@ -7002,15 +7024,19 @@ Return Value: ASSERT(Context->NewBestGraphCount == 0); ASSERT(Context->FirstAttemptSolved == 0); SpareGraph = Context->SpareGraph; - ASSERT(SpareGraph != NULL); - ASSERT(IsSpareGraph(SpareGraph)); - SpareGraph->Flags.IsSpare = FALSE; - Context->SpareGraph = NULL; Context->BestGraph = Graph; FoundBestGraph = TRUE; Coverage->BestGraphNumber = ++Context->NewBestGraphCount; Context->FirstAttemptSolved = Graph->Attempt; - Result = PH_S_USE_NEW_GRAPH_FOR_SOLVING; + if (SpareGraph != NULL) { + ASSERT(IsSpareGraph(SpareGraph)); + SpareGraph->Flags.IsSpare = FALSE; + Context->SpareGraph = NULL; + Result = PH_S_USE_NEW_GRAPH_FOR_SOLVING; + } else { + StopGraphSolving = TRUE; + Result = PH_S_STOP_GRAPH_SOLVING; + } } // @@ -7019,6 +7045,16 @@ Return Value: LeaveCriticalSection(&Context->BestGraphCriticalSection); + if (DebugCudaSolve && IsCuGraph(Graph)) { + fprintf(stderr, + "[GraphRegisterSolved16NoBestCoverage] AfterCS " + "FoundBestGraph=%u Result=0x%08x SpareGraph=%p BestGraph=%p\n", + (unsigned)FoundBestGraph, + (unsigned)Result, + SpareGraph, + Context->BestGraph); + } + End: if (Context->TargetNumberOfSolutions > 0) { @@ -7048,6 +7084,15 @@ Return Value: *NewGraphPointer = SpareGraph; } + if (DebugCudaSolve && IsCuGraph(Graph)) { + fprintf(stderr, + "[GraphRegisterSolved16NoBestCoverage] Exit Result=0x%08x " + "NewGraph=%p Stop=%u\n", + (unsigned)Result, + *NewGraphPointer, + (unsigned)StopGraphSolving); + } + EVENT_WRITE_GRAPH_FOUND(Found); return Result; diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake new file mode 100644 index 00000000..2426b3f5 --- /dev/null +++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake @@ -0,0 +1,46 @@ +if(NOT DEFINED TEST_EXE) + message(FATAL_ERROR "TEST_EXE is required") +endif() +if(NOT DEFINED TEST_KEYS) + message(FATAL_ERROR "TEST_KEYS is required") +endif() +if(NOT DEFINED TEST_OUTPUT) + message(FATAL_ERROR "TEST_OUTPUT is required") +endif() + +file(TO_NATIVE_PATH "${TEST_EXE}" test_exe_native) +file(TO_NATIVE_PATH "${TEST_KEYS}" test_keys_native) +file(TO_NATIVE_PATH "${TEST_OUTPUT}" test_output_native) + +file(MAKE_DIRECTORY "${test_output_native}") + +set(args + "Chm02" + "Mulshrolate3RX" + "And" + "1" + "--CuConcurrency=1" + "--FixedAttempts=2" + "--Seeds=0xF0192B55,0xD9C83970,0x0C1E0D10,0xD11A5847" + "--NoFileIo" + "--DisableCsvOutputFile" +) + +execute_process( + COMMAND "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args} + RESULT_VARIABLE result + OUTPUT_VARIABLE stdout + ERROR_VARIABLE stderr +) + +message(STATUS "stdout: ${stdout}") +message(STATUS "stderr: ${stderr}") + +if(NOT result EQUAL 0) + message(FATAL_ERROR "Command failed with exit code ${result}") +endif() + +string(FIND "${stderr}" "PerfectHashTableCreate failed" failure_index) +if(NOT failure_index EQUAL -1) + message(FATAL_ERROR "Expected Chm02 CUDA known-seed run to succeed, but stderr reported failure.") +endif() From 1425eaa0ae970ee9c8e10279df695d026ea4c481 Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Sun, 29 Mar 2026 21:01:35 -0700 Subject: [PATCH 03/14] Chm02 CUDA: add GPU assignment and verify path --- src/PerfectHash/Cu.h | 12 ++ src/PerfectHash/GraphCu.c | 17 ++- src/PerfectHashCuda/Graph.cu | 121 +++++++++++++++++- src/PerfectHashCuda/PerfectHashCuda.def | 1 + .../run_cli_chm02_cuda_known_seed_test.cmake | 15 ++- 5 files changed, 157 insertions(+), 9 deletions(-) diff --git a/src/PerfectHash/Cu.h b/src/PerfectHash/Cu.h index f86dbaac..8b696653 100644 --- a/src/PerfectHash/Cu.h +++ b/src/PerfectHash/Cu.h @@ -2113,6 +2113,16 @@ HRESULT ); typedef PERFECT_HASH_CUDA_ADD_KEYS *PPERFECT_HASH_CUDA_ADD_KEYS; +typedef +HRESULT +(PERFECT_HASH_CUDA_ASSIGN)( + _Inout_ struct _GRAPH *Graph, + _In_ ULONG BlocksPerGrid, + _In_ ULONG ThreadsPerBlock, + _In_ ULONG SharedMemoryInBytes + ); +typedef PERFECT_HASH_CUDA_ASSIGN *PPERFECT_HASH_CUDA_ASSIGN; + typedef HRESULT (PERFECT_HASH_CUDA_IS_GRAPH_ACYCLIC)( @@ -2142,6 +2152,8 @@ typedef PERFECT_HASH_CUDA_COUNT_NONEMPTY \ FIRST_ENTRY(ADD_KEYS, AddKeys) \ \ + ENTRY(ASSIGN, Assign) \ + \ LAST_ENTRY(IS_GRAPH_ACYCLIC, IsAcyclic) #define PERFECT_HASH_CUDA_FUNCTION_TABLE_ENTRY(ENTRY) \ diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c index 4ec65ab2..c7cb8c0a 100644 --- a/src/PerfectHash/GraphCu.c +++ b/src/PerfectHash/GraphCu.c @@ -1583,7 +1583,7 @@ GraphCuAssign( _In_ PGRAPH Graph ) { - PRTL Rtl; + PCU Cu; HRESULT Result; if (IsCudaDebugEnabled()) { @@ -1594,17 +1594,22 @@ GraphCuAssign( (long)Graph->CpuGraph->OrderIndex); } - Result = Graph->CpuGraph->Vtbl->Assign(Graph->CpuGraph); + Cu = Graph->CuSolveContext->DeviceContext->Cu; + + Result = Cu->Assign(Graph, + Graph->CuBlocksPerGrid, + Graph->CuThreadsPerBlock, + Graph->CuSharedMemory); + if (IsCudaDebugEnabled()) { - fprintf(stderr, "[GraphCuAssign] CpuAssignResult=0x%08x\n", (unsigned)Result); + fprintf(stderr, "[GraphCuAssign] GpuAssignResult=0x%08x\n", (unsigned)Result); } if (FAILED(Result)) { return Result; } - Rtl = Graph->Context->Rtl; - CopyMemory(Graph->Assigned, - Graph->CpuGraph->Assigned, + CopyMemory(Graph->CpuGraph->Assigned, + Graph->Assigned, Graph->Info->AssignedSizeInBytes); return Result; diff --git a/src/PerfectHashCuda/Graph.cu b/src/PerfectHashCuda/Graph.cu index 9dfbe758..86211ea9 100644 --- a/src/PerfectHashCuda/Graph.cu +++ b/src/PerfectHashCuda/Graph.cu @@ -956,6 +956,87 @@ GraphCuIsAcyclicSerialKernel( } } +template +DEVICE +bool +GraphCuIsVisitedVertex( + _In_ GraphType *Graph, + _In_ typename GraphType::VertexType Vertex + ) +{ + auto *Bitmap = (uint64_t *)Graph->VisitedVerticesBitmap.Buffer; + const uint64_t Mask = (1ull << ((uint64_t)Vertex & 63ull)); + return ((Bitmap[(uint64_t)Vertex >> 6] & Mask) != 0); +} + +template +DEVICE +void +GraphCuRegisterVertexVisit( + _In_ GraphType *Graph, + _In_ typename GraphType::VertexType Vertex + ) +{ + auto *Bitmap = (uint64_t *)Graph->VisitedVerticesBitmap.Buffer; + const uint64_t Mask = (1ull << ((uint64_t)Vertex & 63ull)); + Bitmap[(uint64_t)Vertex >> 6] |= Mask; +} + +template +GLOBAL +VOID +GraphCuAssignSerialKernel( + _In_ GraphType *Graph + ) +{ + using AssignedType = typename GraphType::AssignedType; + using Edge3Type = typename GraphType::Edge3Type; + using EdgeType = typename GraphType::EdgeType; + using OrderType = typename GraphType::OrderType; + using VertexType = typename GraphType::VertexType; + + if (blockIdx.x != 0 || threadIdx.x != 0) { + return; + } + + auto *Assigned = (AssignedType *)Graph->Assigned; + auto *Edges3 = (Edge3Type *)Graph->Edges3; + auto *Order = (OrderType *)Graph->Order; + const EdgeType NumberOfEdges = (EdgeType)Graph->NumberOfEdges; + + ASSERT(Graph->Flags.IsAcyclic); + + for (uint32_t Index = (uint32_t)Graph->OrderIndex; + Index < Graph->NumberOfKeys; + Index++) + { + const EdgeType Edge = (EdgeType)Order[Index]; + const Edge3Type *Edge3 = &Edges3[Edge]; + VertexType Vertex1; + VertexType Vertex2; + AssignedType Value; + + if (!GraphCuIsVisitedVertex(Graph, Edge3->Vertex1)) { + Vertex1 = Edge3->Vertex1; + Vertex2 = Edge3->Vertex2; + } else { + Vertex1 = Edge3->Vertex2; + Vertex2 = Edge3->Vertex1; + } + + Value = (AssignedType)(Edge - Assigned[Vertex2]); + if (Value >= NumberOfEdges) { + Value = (AssignedType)(Value + NumberOfEdges); + } + + ASSERT(Assigned[Vertex1] == INITIAL_ASSIGNMENT_VALUE); + Assigned[Vertex1] = Value; + + GraphCuRegisterVertexVisit(Graph, Vertex1); + GraphCuRegisterVertexVisit(Graph, Vertex2); + } +} + EXTERN_C HOST HRESULT @@ -970,7 +1051,6 @@ GraphCuIsAcyclic( LONG DeviceOrderIndex = 0; ULONG DeviceDeletedEdges = 0; - HRESULT Result; PGRAPH DeviceGraph; CUstream_st* Stream; PPH_CU_SOLVE_CONTEXT SolveContext; @@ -1065,8 +1145,45 @@ GraphCuIsAcyclic( } } -End: return (IsAcyclic ? S_OK : PH_E_GRAPH_CYCLIC_FAILURE); } +EXTERN_C +HOST +HRESULT +GraphCuAssign( + _In_ PGRAPH Graph, + _In_ ULONG BlocksPerGrid, + _In_ ULONG ThreadsPerBlock, + _In_ ULONG SharedMemoryInBytes + ) +{ + HRESULT Result = S_OK; + PGRAPH DeviceGraph; + CUstream_st *Stream; + PPH_CU_SOLVE_CONTEXT SolveContext; + + (void)BlocksPerGrid; + (void)ThreadsPerBlock; + (void)SharedMemoryInBytes; + + SolveContext = Graph->CuSolveContext; + DeviceGraph = SolveContext->DeviceGraph; + Stream = (CUstream_st *)SolveContext->Stream; + + if (IsUsingAssigned16(Graph)) { + GraphCuAssignSerialKernel<<<1, 1, 0, Stream>>>( + (PGRAPH16)DeviceGraph + ); + } else { + GraphCuAssignSerialKernel<<<1, 1, 0, Stream>>>( + (PGRAPH32)DeviceGraph + ); + } + + CUDA_CALL(cudaStreamSynchronize(Stream)); + + return Result; +} + // vim:set ts=8 sw=4 sts=4 tw=80 expandtab filetype=cuda formatoptions=croql : diff --git a/src/PerfectHashCuda/PerfectHashCuda.def b/src/PerfectHashCuda/PerfectHashCuda.def index dd6b2c58..4b85ae06 100644 --- a/src/PerfectHashCuda/PerfectHashCuda.def +++ b/src/PerfectHashCuda/PerfectHashCuda.def @@ -2,3 +2,4 @@ LIBRARY PerfectHashCuda EXPORTS GraphCuIsAcyclic GraphCuAddKeys + GraphCuAssign diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake index 2426b3f5..ecda4c4d 100644 --- a/tests/run_cli_chm02_cuda_known_seed_test.cmake +++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake @@ -27,7 +27,8 @@ set(args ) execute_process( - COMMAND "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args} + COMMAND ${CMAKE_COMMAND} -E env PH_DEBUG_CUDA_CHM02=1 + "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args} RESULT_VARIABLE result OUTPUT_VARIABLE stdout ERROR_VARIABLE stderr @@ -44,3 +45,15 @@ string(FIND "${stderr}" "PerfectHashTableCreate failed" failure_index) if(NOT failure_index EQUAL -1) message(FATAL_ERROR "Expected Chm02 CUDA known-seed run to succeed, but stderr reported failure.") endif() + +if(DEFINED REQUIRE_GPU_ASSIGN AND REQUIRE_GPU_ASSIGN) + string(FIND "${stderr}" "[GraphCuAssign] GpuAssignResult=0x00000000" gpu_assign_index) + if(gpu_assign_index EQUAL -1) + message(FATAL_ERROR "Expected GPU assignment success log, but it was not present.") + endif() + + string(FIND "${stderr}" "[GraphCuAssign] CpuAssignResult=" cpu_assign_index) + if(NOT cpu_assign_index EQUAL -1) + message(FATAL_ERROR "Expected GPU assignment path without CPU assign fallback, but CPU assign log was present.") + endif() +endif() From 0d099766234e3a883956f5cbfbeeb206b06942f3 Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Sun, 29 Mar 2026 21:01:57 -0700 Subject: [PATCH 04/14] Chm02 CUDA: fix Linux compat file-work path --- src/PerfectHash/Chm01FileWork.c | 7 ++- src/PerfectHash/Chm01FileWorkStub.c | 4 +- src/PerfectHash/Chm02Compat.c | 2 + .../run_cli_chm02_cuda_known_seed_test.cmake | 59 ++++++++++++++++--- 4 files changed, 58 insertions(+), 14 deletions(-) diff --git a/src/PerfectHash/Chm01FileWork.c b/src/PerfectHash/Chm01FileWork.c index 3c57546a..7a3c9a2f 100644 --- a/src/PerfectHash/Chm01FileWork.c +++ b/src/PerfectHash/Chm01FileWork.c @@ -54,7 +54,6 @@ PERFECT_HASH_FILE_WORK_ITEM_CALLBACK FileWorkItemCallbackChm01; // Begin method implementations. // -#ifdef PH_WINDOWS PERFECT_HASH_FILE_WORK_CALLBACK FileWorkCallbackChm01; _Use_decl_annotations_ @@ -88,13 +87,17 @@ Return Value: { PFILE_WORK_ITEM Item; + if (!ARGUMENT_PRESENT(ListEntry)) { + return; + } + Item = CONTAINING_RECORD(ListEntry, FILE_WORK_ITEM, ListEntry); + Item->Instance = Instance; Item->Context = Context; FileWorkItemCallbackChm01(Item); } -#endif _Use_decl_annotations_ VOID diff --git a/src/PerfectHash/Chm01FileWorkStub.c b/src/PerfectHash/Chm01FileWorkStub.c index e9ea6f81..b5ab2f5d 100644 --- a/src/PerfectHash/Chm01FileWorkStub.c +++ b/src/PerfectHash/Chm01FileWorkStub.c @@ -16,7 +16,6 @@ Module Name: #ifdef PH_ONLINE_ONLY -#ifdef PH_WINDOWS PERFECT_HASH_FILE_WORK_ITEM_CALLBACK FileWorkItemCallbackChm01; PERFECT_HASH_FILE_WORK_CALLBACK FileWorkCallbackChm01; @@ -41,7 +40,6 @@ FileWorkCallbackChm01( FileWorkItemCallbackChm01(Item); } -#endif _Use_decl_annotations_ VOID @@ -60,4 +58,4 @@ FileWorkItemCallbackChm01( #endif // PH_ONLINE_ONLY -// vim:set ts=8 sw=4 sts=4 tw=80 expandtab : \ No newline at end of file +// vim:set ts=8 sw=4 sts=4 tw=80 expandtab : diff --git a/src/PerfectHash/Chm02Compat.c b/src/PerfectHash/Chm02Compat.c index 97302eeb..67c61732 100644 --- a/src/PerfectHash/Chm02Compat.c +++ b/src/PerfectHash/Chm02Compat.c @@ -375,6 +375,7 @@ Return Value: ASSERT(!NoFileIo(Table)); \ ZeroStructInline(Verb##Name); \ Verb##Name.FileWorkId = FileWork##Verb##Name##Id; \ + Verb##Name.Context = Context; \ ThreadpoolAddWork(FileWorkThreadpool, \ FileWorkItemCallbackChm01, \ &Verb##Name); @@ -1010,6 +1011,7 @@ Return Value: ASSERT(!NoFileIo(Table)); \ ZeroStructInline(Verb##Name); \ Verb##Name.FileWorkId = FileWork##Verb##Name##Id; \ + Verb##Name.Context = Context; \ Verb##Name.EndOfFile = EndOfFile; \ ThreadpoolAddWork(FileWorkThreadpool, \ FileWorkItemCallbackChm01, \ diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake index ecda4c4d..ec60a144 100644 --- a/tests/run_cli_chm02_cuda_known_seed_test.cmake +++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake @@ -14,18 +14,59 @@ file(TO_NATIVE_PATH "${TEST_OUTPUT}" test_output_native) file(MAKE_DIRECTORY "${test_output_native}") +set(test_algorithm "Chm02") +if(DEFINED TEST_ALGORITHM) + set(test_algorithm "${TEST_ALGORITHM}") +endif() + +set(test_hash "Mulshrolate3RX") +if(DEFINED TEST_HASH) + set(test_hash "${TEST_HASH}") +endif() + +set(test_mask "And") +if(DEFINED TEST_MASK) + set(test_mask "${TEST_MASK}") +endif() + +set(test_concurrency "1") +if(DEFINED TEST_CONCURRENCY) + set(test_concurrency "${TEST_CONCURRENCY}") +endif() + +set(test_cu_concurrency "--CuConcurrency=1") +if(DEFINED TEST_CU_CONCURRENCY) + set(test_cu_concurrency "${TEST_CU_CONCURRENCY}") +endif() + +set(test_fixed_attempts "--FixedAttempts=2") +if(DEFINED TEST_FIXED_ATTEMPTS) + set(test_fixed_attempts "${TEST_FIXED_ATTEMPTS}") +endif() + +set(test_seeds "--Seeds=0xF0192B55,0xD9C83970,0x0C1E0D10,0xD11A5847") +if(DEFINED TEST_SEEDS) + set(test_seeds "${TEST_SEEDS}") +endif() + set(args - "Chm02" - "Mulshrolate3RX" - "And" - "1" - "--CuConcurrency=1" - "--FixedAttempts=2" - "--Seeds=0xF0192B55,0xD9C83970,0x0C1E0D10,0xD11A5847" - "--NoFileIo" - "--DisableCsvOutputFile" + "${test_algorithm}" + "${test_hash}" + "${test_mask}" + "${test_concurrency}" + "${test_cu_concurrency}" + "${test_fixed_attempts}" + "${test_seeds}" ) +if(DEFINED TEST_FLAGS) + foreach(flag IN LISTS TEST_FLAGS) + list(APPEND args "${flag}") + endforeach() +else() + list(APPEND args "--NoFileIo" "--DisableCsvOutputFile") +endif() + execute_process( COMMAND ${CMAKE_COMMAND} -E env PH_DEBUG_CUDA_CHM02=1 "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args} From 9daa95787ebae9d7e2e96851d755d1a680cf7bd0 Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Sun, 29 Mar 2026 21:02:18 -0700 Subject: [PATCH 05/14] Chm02 CUDA: validate peel order against CPU oracle --- src/PerfectHash/GraphCu.c | 234 ++++++++++++++++++ .../run_cli_chm02_cuda_known_seed_test.cmake | 7 + 2 files changed, 241 insertions(+) diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c index c7cb8c0a..ac038ec9 100644 --- a/src/PerfectHash/GraphCu.c +++ b/src/PerfectHash/GraphCu.c @@ -1330,6 +1330,197 @@ IsCudaDebugEnabled( #endif } +#define DEFINE_VALIDATE_GPU_ORDER(Name, \ + OrderType, \ + EdgeType, \ + Vertex3Type, \ + Edge3Type, \ + OrderField, \ + VerticesField, \ + EdgesField) \ +static \ +HRESULT \ +Name( \ + _In_ PGRAPH Graph, \ + _Out_opt_ PULONG InvalidIndexPointer, \ + _Out_opt_ PLONG InvalidEdgePointer, \ + _Out_opt_ PULONG Degree1Pointer, \ + _Out_opt_ PULONG Degree2Pointer, \ + _Out_opt_ PCSTR *ReasonPointer \ + ) \ +{ \ + LONG SignedEdge; \ + ULONG Index; \ + ULONG NumberOfKeys; \ + ULONG NumberOfVertices; \ + PGRAPH CpuGraph; \ + PBYTE Seen = NULL; \ + HRESULT Result = S_OK; \ + OrderType *Order; \ + EdgeType Edge; \ + Edge3Type *Edge3 = NULL; \ + Edge3Type *Edges; \ + Vertex3Type *Vertex1 = NULL; \ + Vertex3Type *Vertex2 = NULL; \ + Vertex3Type *ScratchVertices = NULL; \ + \ + CpuGraph = Graph->CpuGraph; \ + NumberOfKeys = CpuGraph->NumberOfKeys; \ + NumberOfVertices = CpuGraph->NumberOfVertices; \ + Order = (OrderType *)Graph->OrderField; \ + Edges = (Edge3Type *)CpuGraph->EdgesField; \ + \ + if (InvalidIndexPointer) { \ + *InvalidIndexPointer = (ULONG)-1; \ + } \ + if (InvalidEdgePointer) { \ + *InvalidEdgePointer = -1; \ + } \ + if (Degree1Pointer) { \ + *Degree1Pointer = 0; \ + } \ + if (Degree2Pointer) { \ + *Degree2Pointer = 0; \ + } \ + if (ReasonPointer) { \ + *ReasonPointer = "unknown"; \ + } \ + \ + ScratchVertices = (Vertex3Type *)( \ + calloc(NumberOfVertices, sizeof(*ScratchVertices)) \ + ); \ + Seen = (PBYTE)calloc(NumberOfKeys, sizeof(*Seen)); \ + \ + if (!ScratchVertices || !Seen) { \ + Result = E_OUTOFMEMORY; \ + if (ReasonPointer) { \ + *ReasonPointer = "oom"; \ + } \ + goto End; \ + } \ + \ + CopyMemory(ScratchVertices, \ + CpuGraph->VerticesField, \ + NumberOfVertices * sizeof(*ScratchVertices)); \ + \ + for (Index = NumberOfKeys; Index > 0; Index--) { \ + ULONG OrderIndex; \ + \ + OrderIndex = Index - 1; \ + \ + SignedEdge = (LONG)Order[OrderIndex]; \ + \ + if (SignedEdge < 0 || (ULONG)SignedEdge >= NumberOfKeys) { \ + Result = PH_E_INVARIANT_CHECK_FAILED; \ + if (ReasonPointer) { \ + *ReasonPointer = "out_of_range"; \ + } \ + goto Invalid; \ + } \ + \ + Edge = (EdgeType)SignedEdge; \ + \ + if (Seen[Edge]) { \ + Result = PH_E_INVARIANT_CHECK_FAILED; \ + if (ReasonPointer) { \ + *ReasonPointer = "duplicate_edge"; \ + } \ + goto Invalid; \ + } \ + \ + Seen[Edge] = TRUE; \ + Edge3 = &Edges[Edge]; \ + Vertex1 = &ScratchVertices[Edge3->Vertex1]; \ + Vertex2 = &ScratchVertices[Edge3->Vertex2]; \ + \ + if (Vertex1->Degree == 0 || Vertex2->Degree == 0) { \ + Result = PH_E_INVARIANT_CHECK_FAILED; \ + if (ReasonPointer) { \ + *ReasonPointer = "edge_already_removed"; \ + } \ + goto Invalid; \ + } \ + \ + if (Vertex1->Degree != 1 && Vertex2->Degree != 1) { \ + Result = PH_E_INVARIANT_CHECK_FAILED; \ + if (ReasonPointer) { \ + *ReasonPointer = "no_degree1_endpoint"; \ + } \ + goto Invalid; \ + } \ + \ + Vertex1->Degree -= 1; \ + Vertex1->Edges ^= Edge; \ + Vertex2->Degree -= 1; \ + Vertex2->Edges ^= Edge; \ + } \ + \ + for (Index = 0; Index < NumberOfVertices; Index++) { \ + if (ScratchVertices[Index].Degree != 0) { \ + Result = PH_E_INVARIANT_CHECK_FAILED; \ + if (ReasonPointer) { \ + *ReasonPointer = "residual_degree"; \ + } \ + if (InvalidIndexPointer) { \ + *InvalidIndexPointer = Index; \ + } \ + if (Degree1Pointer) { \ + *Degree1Pointer = ScratchVertices[Index].Degree; \ + } \ + if (InvalidEdgePointer) { \ + *InvalidEdgePointer = -1; \ + } \ + goto End; \ + } \ + } \ + \ + if (ReasonPointer) { \ + *ReasonPointer = "ok"; \ + } \ + goto End; \ + \ +Invalid: \ + if (InvalidIndexPointer) { \ + *InvalidIndexPointer = (Index > 0 ? Index - 1 : 0); \ + } \ + if (InvalidEdgePointer) { \ + *InvalidEdgePointer = SignedEdge; \ + } \ + if (Degree1Pointer) { \ + *Degree1Pointer = Vertex1 ? (ULONG)Vertex1->Degree : 0; \ + } \ + if (Degree2Pointer) { \ + *Degree2Pointer = Vertex2 ? (ULONG)Vertex2->Degree : 0; \ + } \ + \ +End: \ + if (Seen) { \ + free(Seen); \ + } \ + if (ScratchVertices) { \ + free(ScratchVertices); \ + } \ + return Result; \ +} + +DEFINE_VALIDATE_GPU_ORDER(ValidateGpuOrder16, + ORDER16, + EDGE16, + VERTEX163, + EDGE163, + Order16, + Vertices163, + Edges163); + +DEFINE_VALIDATE_GPU_ORDER(ValidateGpuOrder32, + ORDER, + EDGE, + VERTEX3, + EDGE3, + Order, + Vertices3, + Edges3); + HRESULT GraphCuAddKeys( _In_ PGRAPH Graph, @@ -1487,9 +1678,14 @@ GraphCuIsAcyclic( if (IsCudaDebugEnabled()) { HRESULT CpuAcyclicResult; + HRESULT GpuOrderValidationResult; + LONG InvalidEdge; + ULONG Degree1; + ULONG Degree2; ULONG MismatchIndex; ULONG CpuOrder; ULONG GpuOrder; + PCSTR Reason; CpuAcyclicResult = Graph->CpuGraph->Vtbl->IsAcyclic(Graph->CpuGraph); fprintf(stderr, @@ -1548,6 +1744,44 @@ GraphCuIsAcyclic( if (FAILED(Result)) { return Result; } + + MismatchIndex = (ULONG)-1; + InvalidEdge = -1; + Degree1 = 0; + Degree2 = 0; + Reason = "unknown"; + + if (IsUsingAssigned16(Graph)) { + GpuOrderValidationResult = ValidateGpuOrder16(Graph, + &MismatchIndex, + &InvalidEdge, + &Degree1, + &Degree2, + &Reason); + } else { + GpuOrderValidationResult = ValidateGpuOrder32(Graph, + &MismatchIndex, + &InvalidEdge, + &Degree1, + &Degree2, + &Reason); + } + + if (SUCCEEDED(GpuOrderValidationResult)) { + fprintf(stderr, + "[GraphCuIsAcyclic] GpuOrderValidationResult=0x%08x\n", + (unsigned)GpuOrderValidationResult); + } else { + fprintf(stderr, + "[GraphCuIsAcyclic] GpuOrderValidationResult=0x%08x " + "reason=%s index=%u edge=%ld degree1=%u degree2=%u\n", + (unsigned)GpuOrderValidationResult, + Reason, + (unsigned)MismatchIndex, + (long)InvalidEdge, + (unsigned)Degree1, + (unsigned)Degree2); + } } // diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake index ec60a144..347319a2 100644 --- a/tests/run_cli_chm02_cuda_known_seed_test.cmake +++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake @@ -98,3 +98,10 @@ if(DEFINED REQUIRE_GPU_ASSIGN AND REQUIRE_GPU_ASSIGN) message(FATAL_ERROR "Expected GPU assignment path without CPU assign fallback, but CPU assign log was present.") endif() endif() + +if(DEFINED REQUIRE_GPU_ORDER_VALID AND REQUIRE_GPU_ORDER_VALID) + string(FIND "${stderr}" "[GraphCuIsAcyclic] GpuOrderValidationResult=0x00000000" gpu_order_valid_index) + if(gpu_order_valid_index EQUAL -1) + message(FATAL_ERROR "Expected GPU order validation success log, but it was not present.") + endif() +endif() From 3613919efcb54e0f678ad4cad287f179b4d3f77b Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Sun, 29 Mar 2026 21:03:00 -0700 Subject: [PATCH 06/14] Chm02 CUDA: add peel-order validation and GPU verify --- src/PerfectHash/Cu.h | 12 ++ src/PerfectHash/GraphCu.c | 17 ++- src/PerfectHashCuda/Graph.cu | 126 ++++++++++++++++++ src/PerfectHashCuda/PerfectHashCuda.def | 1 + .../run_cli_chm02_cuda_known_seed_test.cmake | 12 ++ 5 files changed, 167 insertions(+), 1 deletion(-) diff --git a/src/PerfectHash/Cu.h b/src/PerfectHash/Cu.h index 8b696653..f25fb1d1 100644 --- a/src/PerfectHash/Cu.h +++ b/src/PerfectHash/Cu.h @@ -2123,6 +2123,16 @@ HRESULT ); typedef PERFECT_HASH_CUDA_ASSIGN *PPERFECT_HASH_CUDA_ASSIGN; +typedef +HRESULT +(PERFECT_HASH_CUDA_VERIFY)( + _Inout_ struct _GRAPH *Graph, + _In_ ULONG BlocksPerGrid, + _In_ ULONG ThreadsPerBlock, + _In_ ULONG SharedMemoryInBytes + ); +typedef PERFECT_HASH_CUDA_VERIFY *PPERFECT_HASH_CUDA_VERIFY; + typedef HRESULT (PERFECT_HASH_CUDA_IS_GRAPH_ACYCLIC)( @@ -2154,6 +2164,8 @@ typedef PERFECT_HASH_CUDA_COUNT_NONEMPTY \ ENTRY(ASSIGN, Assign) \ \ + ENTRY(VERIFY, Verify) \ + \ LAST_ENTRY(IS_GRAPH_ACYCLIC, IsAcyclic) #define PERFECT_HASH_CUDA_FUNCTION_TABLE_ENTRY(ENTRY) \ diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c index ac038ec9..888b038b 100644 --- a/src/PerfectHash/GraphCu.c +++ b/src/PerfectHash/GraphCu.c @@ -1854,9 +1854,24 @@ GraphCuVerify( _In_ PGRAPH Graph ) { + PCU Cu; HRESULT Result; - Result = Graph->CpuGraph->Vtbl->Verify(Graph->CpuGraph); + if (IsCudaDebugEnabled()) { + fprintf(stderr, "[GraphCuVerify] Enter\n"); + } + + Cu = Graph->CuSolveContext->DeviceContext->Cu; + + Result = Cu->Verify(Graph, + Graph->CuBlocksPerGrid, + Graph->CuThreadsPerBlock, + Graph->CuSharedMemory); + + if (IsCudaDebugEnabled()) { + fprintf(stderr, "[GraphCuVerify] GpuVerifyResult=0x%08x\n", (unsigned)Result); + } + return Result; } diff --git a/src/PerfectHashCuda/Graph.cu b/src/PerfectHashCuda/Graph.cu index 86211ea9..fbbc7504 100644 --- a/src/PerfectHashCuda/Graph.cu +++ b/src/PerfectHashCuda/Graph.cu @@ -1037,6 +1037,56 @@ GraphCuAssignSerialKernel( } } +template +GLOBAL +VOID +GraphCuVerifyKernel( + _In_ GraphType *Graph, + _Inout_ uint32_t *Failures + ) +{ + using AssignedType = typename GraphType::AssignedType; + using IndexType = typename GraphType::IndexType; + using KeyType = typename GraphType::KeyType; + using VertexPairType = typename GraphType::VertexPairType; + + uint32_t Index; + KeyType Key; + KeyType *Keys; + VertexPairType Hash; + const uint32_t NumberOfKeys = Graph->NumberOfKeys; + const int32_t Stride = gridDim.x * blockDim.x; + const typename GraphType::HashVertexType HashMask = + (typename GraphType::HashVertexType)(Graph->NumberOfVertices - 1); + const IndexType IndexMask = (IndexType)(Graph->NumberOfEdges - 1); + auto HashFunction = GraphGetHashFunction(Graph); + auto *Assigned = (AssignedType *)Graph->Assigned; + + Keys = (KeyType *)Graph->DeviceKeys; + Index = GlobalThreadIndex(); + + while (Index < NumberOfKeys) { + Key = Keys[Index]; + Hash = HashFunction(Key, HashMask); + + if (Hash.Vertex1 == Hash.Vertex2) { + atomicAdd(Failures, 1u); + Index += Stride; + continue; + } + + IndexType Result = (IndexType)( + (Assigned[Hash.Vertex1] + Assigned[Hash.Vertex2]) & IndexMask + ); + + if ((uint32_t)Result != Index) { + atomicAdd(Failures, 1u); + } + + Index += Stride; + } +} + EXTERN_C HOST HRESULT @@ -1186,4 +1236,80 @@ GraphCuAssign( return Result; } +EXTERN_C +HOST +HRESULT +GraphCuVerify( + _In_ PGRAPH Graph, + _In_ ULONG BlocksPerGrid, + _In_ ULONG ThreadsPerBlock, + _In_ ULONG SharedMemoryInBytes + ) +{ + HRESULT Result = S_OK; + uint32_t Failures = 0; + uint32_t *DeviceFailures = nullptr; + PGRAPH DeviceGraph; + CUstream_st *Stream; + PPH_CU_SOLVE_CONTEXT SolveContext; + ULONG SharedMemory = SharedMemoryInBytes; + + SolveContext = Graph->CuSolveContext; + DeviceGraph = SolveContext->DeviceGraph; + Stream = (CUstream_st *)SolveContext->Stream; + + if (IsUsingAssigned16(Graph)) { + Result = GetKernelConfig(Graph, + (PVOID)GraphCuVerifyKernel, + BlocksPerGrid, + ThreadsPerBlock, + SharedMemory); + } else { + Result = GetKernelConfig(Graph, + (PVOID)GraphCuVerifyKernel, + BlocksPerGrid, + ThreadsPerBlock, + SharedMemory); + } + + if (FAILED(Result)) { + goto End; + } + + CUDA_CALL(cudaMalloc((void **)&DeviceFailures, sizeof(*DeviceFailures))); + CUDA_CALL(cudaMemsetAsync(DeviceFailures, 0, sizeof(*DeviceFailures), Stream)); + + if (IsUsingAssigned16(Graph)) { + GraphCuVerifyKernel<<>>((PGRAPH16)DeviceGraph, + DeviceFailures); + } else { + GraphCuVerifyKernel<<>>((PGRAPH32)DeviceGraph, + DeviceFailures); + } + + CUDA_CALL(cudaMemcpyAsync(&Failures, + DeviceFailures, + sizeof(Failures), + cudaMemcpyDeviceToHost, + Stream)); + CUDA_CALL(cudaStreamSynchronize(Stream)); + + if (Failures > 0) { + Result = PH_E_TABLE_VERIFICATION_FAILED; + } + +End: + if (DeviceFailures) { + cudaFree(DeviceFailures); + } + + return Result; +} + // vim:set ts=8 sw=4 sts=4 tw=80 expandtab filetype=cuda formatoptions=croql : diff --git a/src/PerfectHashCuda/PerfectHashCuda.def b/src/PerfectHashCuda/PerfectHashCuda.def index 4b85ae06..0a5eaade 100644 --- a/src/PerfectHashCuda/PerfectHashCuda.def +++ b/src/PerfectHashCuda/PerfectHashCuda.def @@ -3,3 +3,4 @@ EXPORTS GraphCuIsAcyclic GraphCuAddKeys GraphCuAssign + GraphCuVerify diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake index 347319a2..778f5b31 100644 --- a/tests/run_cli_chm02_cuda_known_seed_test.cmake +++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake @@ -105,3 +105,15 @@ if(DEFINED REQUIRE_GPU_ORDER_VALID AND REQUIRE_GPU_ORDER_VALID) message(FATAL_ERROR "Expected GPU order validation success log, but it was not present.") endif() endif() + +if(DEFINED REQUIRE_GPU_VERIFY AND REQUIRE_GPU_VERIFY) + string(FIND "${stderr}" "[GraphCuVerify] GpuVerifyResult=0x00000000" gpu_verify_index) + if(gpu_verify_index EQUAL -1) + message(FATAL_ERROR "Expected GPU verify success log, but it was not present.") + endif() + + string(FIND "${stderr}" "[GraphCuVerify] CpuVerifyResult=" cpu_verify_index) + if(NOT cpu_verify_index EQUAL -1) + message(FATAL_ERROR "Expected GPU verify path without CPU verify fallback, but CPU verify log was present.") + endif() +endif() From c627a5695bc09f4ec4c9d6e384c7e3037398bf1f Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Sun, 29 Mar 2026 21:03:23 -0700 Subject: [PATCH 07/14] Chm02 CUDA: add first-class regression tests --- tests/CMakeLists.txt | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 96e7bcb7..eaa6687f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -265,6 +265,40 @@ if(TARGET PerfectHashCreateExe AND TARGET PerfectHashBulkCreateExe) PROPERTIES LABELS fast ) + + if(PERFECTHASH_HAS_CUDA) + add_test( + NAME perfecthash.cuda.chm02.hologram.nofileio + COMMAND ${CMAKE_COMMAND} + -DTEST_EXE=$ + -DTEST_KEYS=${TEST_KEYS_FILE} + -DTEST_OUTPUT=${TEST_OUTPUT_DIR}/cuda-chm02-hologram-nofileio + "-DTEST_FLAGS=--NoFileIo;--DisableCsvOutputFile" + -DREQUIRE_GPU_ASSIGN=1 + -DREQUIRE_GPU_ORDER_VALID=1 + -DREQUIRE_GPU_VERIFY=1 + -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_known_seed_test.cmake + ) + + add_test( + NAME perfecthash.cuda.chm02.hologram.fileio + COMMAND ${CMAKE_COMMAND} + -DTEST_EXE=$ + -DTEST_KEYS=${TEST_KEYS_FILE} + -DTEST_OUTPUT=${TEST_OUTPUT_DIR}/cuda-chm02-hologram-fileio + -DREQUIRE_GPU_ASSIGN=1 + -DREQUIRE_GPU_ORDER_VALID=1 + -DREQUIRE_GPU_VERIFY=1 + -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_known_seed_test.cmake + ) + + set_tests_properties( + perfecthash.cuda.chm02.hologram.nofileio + perfecthash.cuda.chm02.hologram.fileio + PROPERTIES + LABELS "cuda" + ) + endif() else() message(STATUS "Skipping PerfectHash CLI tests for profile '${PERFECTHASH_BUILD_PROFILE_NORMALIZED}' (CLI executable targets are not available).") endif() From dd7215152179beaffa25824c6aedeeff312fe1d6 Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Sun, 29 Mar 2026 21:03:44 -0700 Subject: [PATCH 08/14] Chm02 CUDA: add generated non-Assigned16 regression --- tests/CMakeLists.txt | 10 +++ ..._cli_chm02_cuda_generated_33000_test.cmake | 67 +++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 tests/run_cli_chm02_cuda_generated_33000_test.cmake diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index eaa6687f..9c95ab2c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -292,9 +292,19 @@ if(TARGET PerfectHashCreateExe AND TARGET PerfectHashBulkCreateExe) -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_known_seed_test.cmake ) + add_test( + NAME perfecthash.cuda.chm02.generated33000.nofileio + COMMAND ${CMAKE_COMMAND} + -DTEST_EXE=$ + -DTEST_OUTPUT=${TEST_OUTPUT_DIR}/cuda-chm02-generated33000-nofileio + -DTEST_PYTHON=${Python3_EXECUTABLE} + -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_generated_33000_test.cmake + ) + set_tests_properties( perfecthash.cuda.chm02.hologram.nofileio perfecthash.cuda.chm02.hologram.fileio + perfecthash.cuda.chm02.generated33000.nofileio PROPERTIES LABELS "cuda" ) diff --git a/tests/run_cli_chm02_cuda_generated_33000_test.cmake b/tests/run_cli_chm02_cuda_generated_33000_test.cmake new file mode 100644 index 00000000..17f53e40 --- /dev/null +++ b/tests/run_cli_chm02_cuda_generated_33000_test.cmake @@ -0,0 +1,67 @@ +if(NOT DEFINED TEST_EXE) + message(FATAL_ERROR "TEST_EXE is required") +endif() +if(NOT DEFINED TEST_OUTPUT) + message(FATAL_ERROR "TEST_OUTPUT is required") +endif() +if(NOT DEFINED TEST_PYTHON) + message(FATAL_ERROR "TEST_PYTHON is required") +endif() + +set(test_output "${TEST_OUTPUT}") +file(MAKE_DIRECTORY "${test_output}") + +set(generator_script "${test_output}/generate_cuda_33000_keys.py") +file(WRITE "${generator_script}" "import struct\n") +file(APPEND "${generator_script}" "from pathlib import Path\n") +file(APPEND "${generator_script}" "out = Path(r'''${test_output}''')\n") +file(APPEND "${generator_script}" "out.mkdir(parents=True, exist_ok=True)\n") +file(APPEND "${generator_script}" "count = 33000\n") +file(APPEND "${generator_script}" "salt = 0x13579BDF\n") +file(APPEND "${generator_script}" "multiplier = 2654435761\n") +file(APPEND "${generator_script}" "keys = []\n") +file(APPEND "${generator_script}" "i = 1\n") +file(APPEND "${generator_script}" "while len(keys) < count:\n") +file(APPEND "${generator_script}" " value = ((i * multiplier) ^ salt) & 0xffffffff\n") +file(APPEND "${generator_script}" " if value != 0 and value != 0xffffffff:\n") +file(APPEND "${generator_script}" " keys.append(value)\n") +file(APPEND "${generator_script}" " i += 1\n") +file(APPEND "${generator_script}" "keys.sort()\n") +file(APPEND "${generator_script}" "(out / 'generated-33000.keys').write_bytes(b''.join(struct.pack(' Date: Sun, 29 Mar 2026 21:04:06 -0700 Subject: [PATCH 09/14] Chm02 CUDA: add explicit timing surface --- src/PerfectHash/BulkCreateCsv.h | 16 +++++ src/PerfectHash/Chm02.c | 11 +++ src/PerfectHash/Chm02Compat.c | 11 +++ src/PerfectHash/Graph.h | 10 +++ src/PerfectHash/GraphCu.c | 61 ++++++++++++++++ src/PerfectHash/PerfectHashCompat.c | 2 +- src/PerfectHash/PerfectHashTable.h | 5 ++ src/PerfectHash/TableCreateCsv.h | 16 +++++ tests/CMakeLists.txt | 11 +++ tests/run_cli_chm02_cuda_perf_benchmark.cmake | 72 +++++++++++++++++++ 10 files changed, 214 insertions(+), 1 deletion(-) create mode 100644 tests/run_cli_chm02_cuda_perf_benchmark.cmake diff --git a/src/PerfectHash/BulkCreateCsv.h b/src/PerfectHash/BulkCreateCsv.h index 3dd73c83..dbf876e9 100644 --- a/src/PerfectHash/BulkCreateCsv.h +++ b/src/PerfectHash/BulkCreateCsv.h @@ -424,6 +424,22 @@ Module Name: Context->VerifyElapsedMicroseconds.QuadPart, \ OUTPUT_INT) \ \ + ENTRY(CuAddKeysMicroseconds, \ + Table->CuAddKeysElapsedMicroseconds.QuadPart, \ + OUTPUT_INT) \ + \ + ENTRY(CuIsAcyclicMicroseconds, \ + Table->CuIsAcyclicElapsedMicroseconds.QuadPart, \ + OUTPUT_INT) \ + \ + ENTRY(CuAssignMicroseconds, \ + Table->CuAssignElapsedMicroseconds.QuadPart, \ + OUTPUT_INT) \ + \ + ENTRY(CuVerifyMicroseconds, \ + Table->CuVerifyElapsedMicroseconds.QuadPart, \ + OUTPUT_INT) \ + \ ENTRY(BenchmarkWarmups, \ Table->BenchmarkWarmups, \ OUTPUT_INT) \ diff --git a/src/PerfectHash/Chm02.c b/src/PerfectHash/Chm02.c index 0fda6299..93689690 100644 --- a/src/PerfectHash/Chm02.c +++ b/src/PerfectHash/Chm02.c @@ -709,6 +709,17 @@ Return Value: COPY_GRAPH_COUNTERS_FROM_GRAPH_TO_TABLE(); +#ifdef PH_USE_CUDA + Table->CuAddKeysElapsedMicroseconds.QuadPart = + Graph->CuAddKeysElapsedMicroseconds.QuadPart; + Table->CuIsAcyclicElapsedMicroseconds.QuadPart = + Graph->CuIsAcyclicElapsedMicroseconds.QuadPart; + Table->CuAssignElapsedMicroseconds.QuadPart = + Graph->CuAssignElapsedMicroseconds.QuadPart; + Table->CuVerifyElapsedMicroseconds.QuadPart = + Graph->CuVerifyElapsedMicroseconds.QuadPart; +#endif + // // Note this graph as the one solved to the context. This is used by the // save file work callback we dispatch below. diff --git a/src/PerfectHash/Chm02Compat.c b/src/PerfectHash/Chm02Compat.c index 67c61732..a3c7f3e2 100644 --- a/src/PerfectHash/Chm02Compat.c +++ b/src/PerfectHash/Chm02Compat.c @@ -700,6 +700,17 @@ Return Value: COPY_GRAPH_COUNTERS_FROM_GRAPH_TO_TABLE(); +#ifdef PH_USE_CUDA + Table->CuAddKeysElapsedMicroseconds.QuadPart = + Graph->CuAddKeysElapsedMicroseconds.QuadPart; + Table->CuIsAcyclicElapsedMicroseconds.QuadPart = + Graph->CuIsAcyclicElapsedMicroseconds.QuadPart; + Table->CuAssignElapsedMicroseconds.QuadPart = + Graph->CuAssignElapsedMicroseconds.QuadPart; + Table->CuVerifyElapsedMicroseconds.QuadPart = + Graph->CuVerifyElapsedMicroseconds.QuadPart; +#endif + // // Capture RNG details from the winning graph if the RNG used was not the // System one. diff --git a/src/PerfectHash/Graph.h b/src/PerfectHash/Graph.h index 86a46296..9ba73cf6 100644 --- a/src/PerfectHash/Graph.h +++ b/src/PerfectHash/Graph.h @@ -1874,6 +1874,15 @@ typedef struct _Struct_size_bytes_(SizeOfStruct) _GRAPH { ULONG CuIsAcyclicPhase1Attempts; + // + // Explicit CUDA phase timings for performance benchmarking. + // + + LARGE_INTEGER CuAddKeysElapsedMicroseconds; + LARGE_INTEGER CuIsAcyclicElapsedMicroseconds; + LARGE_INTEGER CuAssignElapsedMicroseconds; + LARGE_INTEGER CuVerifyElapsedMicroseconds; + // // CUDA RNG details. // @@ -2157,6 +2166,7 @@ extern GRAPH_REGISTER_SOLVED GraphRegisterSolved; #ifdef _M_X64 extern GRAPH_REGISTER_SOLVED GraphRegisterSolvedTsx; #endif + extern GRAPH_SHOULD_WE_CONTINUE_TRYING_TO_SOLVE GraphShouldWeContinueTryingToSolve; extern GRAPH_ADD_KEYS GraphAddKeys; diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c index 888b038b..1442ac86 100644 --- a/src/PerfectHash/GraphCu.c +++ b/src/PerfectHash/GraphCu.c @@ -1081,6 +1081,10 @@ Return Value: Graph->TraversalDepth = 0; Graph->TotalTraversals = 0; Graph->MaximumTraversalDepth = 0; + Graph->CuAddKeysElapsedMicroseconds.QuadPart = 0; + Graph->CuIsAcyclicElapsedMicroseconds.QuadPart = 0; + Graph->CuAssignElapsedMicroseconds.QuadPart = 0; + Graph->CuVerifyElapsedMicroseconds.QuadPart = 0; Graph->SolvedTime.AsULongLong = 0; @@ -1330,6 +1334,23 @@ IsCudaDebugEnabled( #endif } +FORCEINLINE +VOID +CaptureCuElapsedMicroseconds( + _In_ PGRAPH Graph, + _In_ LARGE_INTEGER Start, + _In_ LARGE_INTEGER End, + _Out_ PLARGE_INTEGER ElapsedMicroseconds + ) +{ + LONGLONG Cycles; + + Cycles = End.QuadPart - Start.QuadPart; + ElapsedMicroseconds->QuadPart = ( + (Cycles * 1000000) / Graph->Context->Frequency.QuadPart + ); +} + #define DEFINE_VALIDATE_GPU_ORDER(Name, \ OrderType, \ EdgeType, \ @@ -1528,6 +1549,8 @@ GraphCuAddKeys( _In_reads_(NumberOfKeys) PKEY Keys ) { + LARGE_INTEGER Start; + LARGE_INTEGER End; PCU Cu; HRESULT Result; @@ -1541,11 +1564,19 @@ GraphCuAddKeys( Cu = Graph->CuSolveContext->DeviceContext->Cu; + QueryPerformanceCounter(&Start); + Result = Cu->AddKeys(Graph, Graph->CuBlocksPerGrid, Graph->CuThreadsPerBlock, Graph->CuSharedMemory); + QueryPerformanceCounter(&End); + CaptureCuElapsedMicroseconds(Graph, + Start, + End, + &Graph->CuAddKeysElapsedMicroseconds); + if (IsCudaDebugEnabled()) { fprintf(stderr, "[GraphCuAddKeys] Result=0x%08x HashKeysResult=0x%08x " @@ -1564,6 +1595,8 @@ GraphCuIsAcyclic( _In_ PGRAPH Graph ) { + LARGE_INTEGER Start; + LARGE_INTEGER End; PCU Cu; PRTL Rtl; PKEY Keys; @@ -1584,6 +1617,8 @@ GraphCuIsAcyclic( SolveContext = Graph->CuSolveContext; Cu = SolveContext->DeviceContext->Cu; + QueryPerformanceCounter(&Start); + Result = Cu->IsAcyclic(Graph, Graph->CuBlocksPerGrid, Graph->CuThreadsPerBlock, @@ -1809,6 +1844,12 @@ GraphCuIsAcyclic( InterlockedIncrement64(&Context->GpuAndCpuIsAcyclicSuccess); + QueryPerformanceCounter(&End); + CaptureCuElapsedMicroseconds(Graph, + Start, + End, + &Graph->CuIsAcyclicElapsedMicroseconds); + return Result; } @@ -1817,6 +1858,8 @@ GraphCuAssign( _In_ PGRAPH Graph ) { + LARGE_INTEGER Start; + LARGE_INTEGER End; PCU Cu; HRESULT Result; @@ -1830,6 +1873,8 @@ GraphCuAssign( Cu = Graph->CuSolveContext->DeviceContext->Cu; + QueryPerformanceCounter(&Start); + Result = Cu->Assign(Graph, Graph->CuBlocksPerGrid, Graph->CuThreadsPerBlock, @@ -1846,6 +1891,12 @@ GraphCuAssign( Graph->Assigned, Graph->Info->AssignedSizeInBytes); + QueryPerformanceCounter(&End); + CaptureCuElapsedMicroseconds(Graph, + Start, + End, + &Graph->CuAssignElapsedMicroseconds); + return Result; } @@ -1854,6 +1905,8 @@ GraphCuVerify( _In_ PGRAPH Graph ) { + LARGE_INTEGER Start; + LARGE_INTEGER End; PCU Cu; HRESULT Result; @@ -1863,11 +1916,19 @@ GraphCuVerify( Cu = Graph->CuSolveContext->DeviceContext->Cu; + QueryPerformanceCounter(&Start); + Result = Cu->Verify(Graph, Graph->CuBlocksPerGrid, Graph->CuThreadsPerBlock, Graph->CuSharedMemory); + QueryPerformanceCounter(&End); + CaptureCuElapsedMicroseconds(Graph, + Start, + End, + &Graph->CuVerifyElapsedMicroseconds); + if (IsCudaDebugEnabled()) { fprintf(stderr, "[GraphCuVerify] GpuVerifyResult=0x%08x\n", (unsigned)Result); } diff --git a/src/PerfectHash/PerfectHashCompat.c b/src/PerfectHash/PerfectHashCompat.c index cc63c7f8..ddb88019 100644 --- a/src/PerfectHash/PerfectHashCompat.c +++ b/src/PerfectHash/PerfectHashCompat.c @@ -615,7 +615,7 @@ QueryPerformanceFrequency( _Out_ LARGE_INTEGER* lpFrequency ) { - lpFrequency->QuadPart = 1000; + lpFrequency->QuadPart = 1000000000ull; return TRUE; } diff --git a/src/PerfectHash/PerfectHashTable.h b/src/PerfectHash/PerfectHashTable.h index 21483401..33d516d5 100644 --- a/src/PerfectHash/PerfectHashTable.h +++ b/src/PerfectHash/PerfectHashTable.h @@ -870,6 +870,11 @@ typedef struct _Struct_size_bytes_(SizeOfStruct) _PERFECT_HASH_TABLE { DECL_GRAPH_COUNTERS_WITHIN_STRUCT(); + LARGE_INTEGER CuAddKeysElapsedMicroseconds; + LARGE_INTEGER CuIsAcyclicElapsedMicroseconds; + LARGE_INTEGER CuAssignElapsedMicroseconds; + LARGE_INTEGER CuVerifyElapsedMicroseconds; + // // Rng details from the winning graph. // diff --git a/src/PerfectHash/TableCreateCsv.h b/src/PerfectHash/TableCreateCsv.h index 8d4417ac..b1b5d828 100644 --- a/src/PerfectHash/TableCreateCsv.h +++ b/src/PerfectHash/TableCreateCsv.h @@ -432,6 +432,22 @@ Module Name: Context->VerifyElapsedMicroseconds.QuadPart, \ OUTPUT_INT) \ \ + ENTRY(CuAddKeysMicroseconds, \ + Table->CuAddKeysElapsedMicroseconds.QuadPart, \ + OUTPUT_INT) \ + \ + ENTRY(CuIsAcyclicMicroseconds, \ + Table->CuIsAcyclicElapsedMicroseconds.QuadPart, \ + OUTPUT_INT) \ + \ + ENTRY(CuAssignMicroseconds, \ + Table->CuAssignElapsedMicroseconds.QuadPart, \ + OUTPUT_INT) \ + \ + ENTRY(CuVerifyMicroseconds, \ + Table->CuVerifyElapsedMicroseconds.QuadPart, \ + OUTPUT_INT) \ + \ ENTRY(BenchmarkWarmups, \ Table->BenchmarkWarmups, \ OUTPUT_INT) \ diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9c95ab2c..f4a836f4 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -301,10 +301,21 @@ if(TARGET PerfectHashCreateExe AND TARGET PerfectHashBulkCreateExe) -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_generated_33000_test.cmake ) + add_test( + NAME perfecthash.cuda.chm02.perf-surface + COMMAND ${CMAKE_COMMAND} + -DTEST_EXE=$ + -DTEST_KEYS=${TEST_KEYS_FILE} + -DTEST_OUTPUT=${TEST_OUTPUT_DIR}/cuda-chm02-perf-surface + -DTEST_PYTHON=${Python3_EXECUTABLE} + -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_perf_benchmark.cmake + ) + set_tests_properties( perfecthash.cuda.chm02.hologram.nofileio perfecthash.cuda.chm02.hologram.fileio perfecthash.cuda.chm02.generated33000.nofileio + perfecthash.cuda.chm02.perf-surface PROPERTIES LABELS "cuda" ) diff --git a/tests/run_cli_chm02_cuda_perf_benchmark.cmake b/tests/run_cli_chm02_cuda_perf_benchmark.cmake new file mode 100644 index 00000000..1bf4fbba --- /dev/null +++ b/tests/run_cli_chm02_cuda_perf_benchmark.cmake @@ -0,0 +1,72 @@ +if(NOT DEFINED TEST_EXE) + message(FATAL_ERROR "TEST_EXE is required") +endif() +if(NOT DEFINED TEST_KEYS) + message(FATAL_ERROR "TEST_KEYS is required") +endif() +if(NOT DEFINED TEST_OUTPUT) + message(FATAL_ERROR "TEST_OUTPUT is required") +endif() +if(NOT DEFINED TEST_PYTHON) + message(FATAL_ERROR "TEST_PYTHON is required") +endif() + +file(TO_NATIVE_PATH "${TEST_EXE}" test_exe_native) +file(TO_NATIVE_PATH "${TEST_KEYS}" test_keys_native) +file(TO_NATIVE_PATH "${TEST_OUTPUT}" test_output_native) +file(REMOVE_RECURSE "${test_output_native}") +file(MAKE_DIRECTORY "${test_output_native}") + +set(test_args + Chm02 + Mulshrolate3RX + And + 1 + --CuConcurrency=1 + --FixedAttempts=2 + --Seeds=0xF0192B55,0xD9C83970,0x0C1E0D10,0xD11A5847 + --NoFileIo + --SkipTestAfterCreate +) + +execute_process( + COMMAND "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${test_args} + RESULT_VARIABLE result + OUTPUT_VARIABLE stdout + ERROR_VARIABLE stderr +) + +if(NOT result EQUAL 0) + message(STATUS "stdout: ${stdout}") + message(STATUS "stderr: ${stderr}") + message(FATAL_ERROR "Command failed with exit code ${result}") +endif() + +file(GLOB csv_files "${test_output_native}/*.csv") +list(LENGTH csv_files csv_count) +if(NOT csv_count EQUAL 1) + message(FATAL_ERROR "Expected exactly one CSV output file, found ${csv_count}") +endif() + +set(csv_path "${csv_files}") + +execute_process( + COMMAND "${TEST_PYTHON}" -c + "import csv, sys; \ +path = sys.argv[1]; \ +required = ['CuAddKeysMicroseconds', 'CuIsAcyclicMicroseconds', 'CuAssignMicroseconds', 'CuVerifyMicroseconds']; \ +row = next(csv.DictReader(open(path, newline=''))); \ +missing = [name for name in required if name not in row]; \ +assert not missing, f'Missing timing field(s): {missing}'; \ +negative = [name for name in required if int(row[name]) < 0]; \ +assert not negative, f'Negative timing field(s): {negative}'" "${csv_path}" + RESULT_VARIABLE parse_result + OUTPUT_VARIABLE parse_stdout + ERROR_VARIABLE parse_stderr +) + +if(NOT parse_result EQUAL 0) + message(STATUS "stdout: ${parse_stdout}") + message(STATUS "stderr: ${parse_stderr}") + message(FATAL_ERROR "CSV timing validation failed (exit ${parse_result})") +endif() From ccdcd5ef10fdb6e0df1a6f4c035b7d5a24839c45 Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Sun, 29 Mar 2026 22:04:17 -0700 Subject: [PATCH 10/14] Chm02 CUDA: address review findings --- docs/chm02-cuda-mainline.md | 67 +++++++++++++++++++ src/PerfectHash/BulkCreateBestCsv.h | 6 +- src/PerfectHash/BulkCreateCsv.h | 6 +- src/PerfectHash/Chm02.c | 15 +---- src/PerfectHash/Graph.c | 24 ++----- src/PerfectHash/Graph.h | 10 ++- src/PerfectHash/GraphCu.c | 27 ++------ src/PerfectHash/PerfectHashPrivate.h | 13 ++++ src/PerfectHash/TableCreateBestCsv.h | 6 +- src/PerfectHash/TableCreateCsv.h | 6 +- src/PerfectHashCuda/Graph.cu | 10 +-- .../run_cli_chm02_cuda_known_seed_test.cmake | 10 --- 12 files changed, 110 insertions(+), 90 deletions(-) create mode 100644 docs/chm02-cuda-mainline.md diff --git a/docs/chm02-cuda-mainline.md b/docs/chm02-cuda-mainline.md new file mode 100644 index 00000000..26aaf558 --- /dev/null +++ b/docs/chm02-cuda-mainline.md @@ -0,0 +1,67 @@ +# Chm02 CUDA Mainline Note + +## Summary + +This note captures the intent of the `issue-79-chm02-cuda-mainline` branch. + +The branch promotes the legacy `Chm02` CUDA path from a CPU-assisted bring-up + state toward a first-class correctness path by moving the major solve phases + (`IsAcyclic`, `Assign`, `Verify`) onto the GPU while keeping CPU-oracle-style + validation and debugging support available during bring-up. + +## Goals + +- Fix correctness blockers in the existing `Chm02` CUDA path. +- Make known-seed CLI runs succeed on Linux in both no-file-io and file-io + configurations. +- Add regression coverage for: + - known-seed `Chm02` CUDA runs + - a generated non-`Assigned16` case + - timing-field presence +- Expose explicit per-phase CUDA timing fields for measurement. + +## Non-Goals + +- High-throughput GPU solving. +- Batched multi-attempt GPU construction. +- Replacing the standalone GPU peeling POC. +- Eliminating all CPU-oracle/debug-only code from the branch. + +The current `Chm02` CUDA implementation remains correctness-first, not + throughput-first. + +## Supported Scope + +- Algorithm: `Chm02` +- Hash path: known-good seeded hash families already supported by the repo +- CUDA path: single-graph bring-up / validation +- Platform focus: + - Linux with CUDA enabled + - existing regression coverage on the configured CUDA host + +## Fallback / Debugging Policy + +- Normal operation should use the GPU path for add-keys, acyclic detection, + assignment, and verify. +- CPU-oracle and order-validation logic is intended as bring-up/debug support. +- `PH_DEBUG_CUDA_CHM02` enables extra logging and validation details for + troubleshooting. + +## Timing Contract + +The following CSV fields are emitted: + +- `CuAddKeysMicroseconds` +- `CuIsAcyclicMicroseconds` +- `CuAssignMicroseconds` +- `CuVerifyMicroseconds` + +These are synchronized phase timings around the CUDA-backed phase wrappers, not + raw kernel-only device timings. + +## Acceptance + +- The focused CUDA `Chm02` regression tests pass when CUDA is enabled. +- Known-seed `Chm02` CUDA runs succeed on Linux. +- File-io and no-file-io paths both work in the covered scenarios. +- Timing fields are present and non-negative in CSV output. diff --git a/src/PerfectHash/BulkCreateBestCsv.h b/src/PerfectHash/BulkCreateBestCsv.h index 13e8019c..ee57f8c9 100644 --- a/src/PerfectHash/BulkCreateBestCsv.h +++ b/src/PerfectHash/BulkCreateBestCsv.h @@ -280,16 +280,12 @@ Module Name: Context->GpuAddKeysSuccessButCpuAddKeysFailures, \ OUTPUT_INT) \ \ - ENTRY(GpuIsAcyclicButCpuIsCyclicFailures, \ - Context->GpuIsAcyclicButCpuIsCyclicFailures, \ - OUTPUT_INT) \ - \ ENTRY(GpuAndCpuAddKeysSuccess, \ Context->GpuAndCpuAddKeysSuccess, \ OUTPUT_INT) \ \ ENTRY(GpuAndCpuIsAcyclicSuccess, \ - Context->GpuAndCpuAddKeysSuccess, \ + Context->GpuAndCpuIsAcyclicSuccess, \ OUTPUT_INT) \ \ ENTRY(BestCoverageAttempts, \ diff --git a/src/PerfectHash/BulkCreateCsv.h b/src/PerfectHash/BulkCreateCsv.h index dbf876e9..2871fc87 100644 --- a/src/PerfectHash/BulkCreateCsv.h +++ b/src/PerfectHash/BulkCreateCsv.h @@ -279,16 +279,12 @@ Module Name: Context->GpuAddKeysSuccessButCpuAddKeysFailures, \ OUTPUT_INT) \ \ - ENTRY(GpuIsAcyclicButCpuIsCyclicFailures, \ - Context->GpuIsAcyclicButCpuIsCyclicFailures, \ - OUTPUT_INT) \ - \ ENTRY(GpuAndCpuAddKeysSuccess, \ Context->GpuAndCpuAddKeysSuccess, \ OUTPUT_INT) \ \ ENTRY(GpuAndCpuIsAcyclicSuccess, \ - Context->GpuAndCpuAddKeysSuccess, \ + Context->GpuAndCpuIsAcyclicSuccess, \ OUTPUT_INT) \ \ ENTRY(BestCoverageAttempts, \ diff --git a/src/PerfectHash/Chm02.c b/src/PerfectHash/Chm02.c index 93689690..242a58fe 100644 --- a/src/PerfectHash/Chm02.c +++ b/src/PerfectHash/Chm02.c @@ -17,19 +17,6 @@ Module Name: #include "Chm01.h" #include "Chm02Private.h" -FORCEINLINE -BOOLEAN -IsChm02CudaDebugEnabled( - VOID - ) -{ -#ifdef PH_WINDOWS - return (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0); -#else - return (getenv("PH_DEBUG_CUDA_CHM02") != NULL); -#endif -} - // // Main table creation implementation routine for Chm02. // @@ -196,7 +183,7 @@ Return Value: TableCreateFlags.AsULongLong = Table->TableCreateFlags.AsULongLong; Silent = (TableCreateFlags.Silent != FALSE); - DebugCuda = IsChm02CudaDebugEnabled(); + DebugCuda = IsPerfectHashCudaDebugEnabled(); // // Initialize variables used if we jump to Error early-on. diff --git a/src/PerfectHash/Graph.c b/src/PerfectHash/Graph.c index bd35725b..17d1adb7 100644 --- a/src/PerfectHash/Graph.c +++ b/src/PerfectHash/Graph.c @@ -555,13 +555,7 @@ Return Value: Table = Context->Table; NumberOfKeys = Table->Keys->NumberOfKeys.LowPart; Edges = Keys = (PKEY)Table->Keys->KeyArrayBaseAddress; - DebugCudaSolve = FALSE; - -#ifdef PH_WINDOWS - DebugCudaSolve = (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0); -#else - DebugCudaSolve = (getenv("PH_DEBUG_CUDA_CHM02") != NULL); -#endif + DebugCudaSolve = (IsCuGraph(Graph) ? IsCudaDebugGraph(Graph) : FALSE); // // Attempt to add all the keys to the graph. @@ -4017,12 +4011,8 @@ Return Value: return E_POINTER; } - DebugCudaSolve = FALSE; -#ifdef PH_WINDOWS - DebugCudaSolve = (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0); -#else - DebugCudaSolve = (getenv("PH_DEBUG_CUDA_CHM02") != NULL); -#endif + DebugCudaSolve = (IsCuGraph(Graph) ? IsPerfectHashCudaDebugEnabled() : FALSE); + Graph->Flags.DebugCudaChm02 = DebugCudaSolve; // // Acquire the exclusive graph lock for the duration of the routine. The @@ -6975,13 +6965,7 @@ Return Value: CoverageType = Context->BestCoverageType; Attempt = Coverage->Attempt; ElapsedMilliseconds = GetTickCount64() - Context->StartMilliseconds; - DebugCudaSolve = FALSE; - -#ifdef PH_WINDOWS - DebugCudaSolve = (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0); -#else - DebugCudaSolve = (getenv("PH_DEBUG_CUDA_CHM02") != NULL); -#endif + DebugCudaSolve = (IsCuGraph(Graph) ? IsCudaDebugGraph(Graph) : FALSE); if (DebugCudaSolve && IsCuGraph(Graph)) { fprintf(stderr, diff --git a/src/PerfectHash/Graph.h b/src/PerfectHash/Graph.h index 9ba73cf6..884452e4 100644 --- a/src/PerfectHash/Graph.h +++ b/src/PerfectHash/Graph.h @@ -906,11 +906,18 @@ typedef union _GRAPH_FLAGS { ULONG HasUserSeeds:1; + // + // When set, enables PH_DEBUG_CUDA_CHM02 bring-up logging for this + // graph without re-reading the environment variable in hot paths. + // + + ULONG DebugCudaChm02:1; + // // Unused bits. // - ULONG Unused:10; + ULONG Unused:9; }; LONG AsLong; ULONG AsULong; @@ -935,6 +942,7 @@ C_ASSERT(sizeof(GRAPH_FLAGS) == sizeof(ULONG)); ((Graph)->Flags.WantsCuRandomHostSeeds != FALSE) #define IsGraphParanoid(Graph) ((Graph)->Flags.Paranoid != FALSE) #define IsUsingAssigned16(Graph) ((Graph)->Flags.UsingAssigned16 != FALSE) +#define IsCudaDebugGraph(Graph) ((Graph)->Flags.DebugCudaChm02 != FALSE) #define SetSpareGraph(Graph) (Graph->Flags.IsSpareGraph = TRUE) #define SetSpareCuGraph(Graph) (Graph->Flags.IsSpareCuGraph = TRUE) diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c index 1442ac86..2e13d422 100644 --- a/src/PerfectHash/GraphCu.c +++ b/src/PerfectHash/GraphCu.c @@ -1321,19 +1321,6 @@ GraphCuLoadNewSeeds( return Result; } -FORCEINLINE -BOOLEAN -IsCudaDebugEnabled( - VOID - ) -{ -#ifdef PH_WINDOWS - return (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0); -#else - return (getenv("PH_DEBUG_CUDA_CHM02") != NULL); -#endif -} - FORCEINLINE VOID CaptureCuElapsedMicroseconds( @@ -1577,7 +1564,7 @@ GraphCuAddKeys( End, &Graph->CuAddKeysElapsedMicroseconds); - if (IsCudaDebugEnabled()) { + if (IsCudaDebugGraph(Graph)) { fprintf(stderr, "[GraphCuAddKeys] Result=0x%08x HashKeysResult=0x%08x " "VertexFailures=%u WarpFailures=%u\n", @@ -1624,7 +1611,7 @@ GraphCuIsAcyclic( Graph->CuThreadsPerBlock, Graph->CuSharedMemory); - if (IsCudaDebugEnabled()) { + if (IsCudaDebugGraph(Graph)) { fprintf(stderr, "[GraphCuIsAcyclic] GpuResult=0x%08x Attempts=%u " "DeletedEdges=%u OrderIndex=%ld\n", @@ -1711,7 +1698,7 @@ GraphCuIsAcyclic( InterlockedIncrement64(&Context->GpuAndCpuAddKeysSuccess); } - if (IsCudaDebugEnabled()) { + if (IsCudaDebugGraph(Graph)) { HRESULT CpuAcyclicResult; HRESULT GpuOrderValidationResult; LONG InvalidEdge; @@ -1863,7 +1850,7 @@ GraphCuAssign( PCU Cu; HRESULT Result; - if (IsCudaDebugEnabled()) { + if (IsCudaDebugGraph(Graph)) { fprintf(stderr, "[GraphCuAssign] Enter IsAcyclic=%u OrderIndex=%ld CpuOrderIndex=%ld\n", (unsigned)Graph->Flags.IsAcyclic, @@ -1880,7 +1867,7 @@ GraphCuAssign( Graph->CuThreadsPerBlock, Graph->CuSharedMemory); - if (IsCudaDebugEnabled()) { + if (IsCudaDebugGraph(Graph)) { fprintf(stderr, "[GraphCuAssign] GpuAssignResult=0x%08x\n", (unsigned)Result); } if (FAILED(Result)) { @@ -1910,7 +1897,7 @@ GraphCuVerify( PCU Cu; HRESULT Result; - if (IsCudaDebugEnabled()) { + if (IsCudaDebugGraph(Graph)) { fprintf(stderr, "[GraphCuVerify] Enter\n"); } @@ -1929,7 +1916,7 @@ GraphCuVerify( End, &Graph->CuVerifyElapsedMicroseconds); - if (IsCudaDebugEnabled()) { + if (IsCudaDebugGraph(Graph)) { fprintf(stderr, "[GraphCuVerify] GpuVerifyResult=0x%08x\n", (unsigned)Result); } diff --git a/src/PerfectHash/PerfectHashPrivate.h b/src/PerfectHash/PerfectHashPrivate.h index 17fb0bb5..c0c00d7d 100644 --- a/src/PerfectHash/PerfectHashPrivate.h +++ b/src/PerfectHash/PerfectHashPrivate.h @@ -48,6 +48,19 @@ extern MODULEINFO PerfectHashModuleInfo; extern volatile ULONG CtrlCPressed; +FORCEINLINE +BOOLEAN +IsPerfectHashCudaDebugEnabled( + VOID + ) +{ +#ifdef PH_WINDOWS + return (GetEnvironmentVariableA("PH_DEBUG_CUDA_CHM02", NULL, 0) > 0); +#else + return (getenv("PH_DEBUG_CUDA_CHM02") != NULL); +#endif +} + // // Define a helper macro for validating flags passed as parameters to routines. // diff --git a/src/PerfectHash/TableCreateBestCsv.h b/src/PerfectHash/TableCreateBestCsv.h index fc20a48f..5b610151 100644 --- a/src/PerfectHash/TableCreateBestCsv.h +++ b/src/PerfectHash/TableCreateBestCsv.h @@ -280,16 +280,12 @@ Module Name: Context->GpuAddKeysSuccessButCpuAddKeysFailures, \ OUTPUT_INT) \ \ - ENTRY(GpuIsAcyclicButCpuIsCyclicFailures, \ - Context->GpuIsAcyclicButCpuIsCyclicFailures, \ - OUTPUT_INT) \ - \ ENTRY(GpuAndCpuAddKeysSuccess, \ Context->GpuAndCpuAddKeysSuccess, \ OUTPUT_INT) \ \ ENTRY(GpuAndCpuIsAcyclicSuccess, \ - Context->GpuAndCpuAddKeysSuccess, \ + Context->GpuAndCpuIsAcyclicSuccess, \ OUTPUT_INT) \ \ ENTRY(BestCoverageAttempts, \ diff --git a/src/PerfectHash/TableCreateCsv.h b/src/PerfectHash/TableCreateCsv.h index b1b5d828..a4f525b6 100644 --- a/src/PerfectHash/TableCreateCsv.h +++ b/src/PerfectHash/TableCreateCsv.h @@ -279,16 +279,12 @@ Module Name: Context->GpuAddKeysSuccessButCpuAddKeysFailures, \ OUTPUT_INT) \ \ - ENTRY(GpuIsAcyclicButCpuIsCyclicFailures, \ - Context->GpuIsAcyclicButCpuIsCyclicFailures, \ - OUTPUT_INT) \ - \ ENTRY(GpuAndCpuAddKeysSuccess, \ Context->GpuAndCpuAddKeysSuccess, \ OUTPUT_INT) \ \ ENTRY(GpuAndCpuIsAcyclicSuccess, \ - Context->GpuAndCpuAddKeysSuccess, \ + Context->GpuAndCpuIsAcyclicSuccess, \ OUTPUT_INT) \ \ ENTRY(BestCoverageAttempts, \ diff --git a/src/PerfectHashCuda/Graph.cu b/src/PerfectHashCuda/Graph.cu index fbbc7504..d4ece595 100644 --- a/src/PerfectHashCuda/Graph.cu +++ b/src/PerfectHashCuda/Graph.cu @@ -926,10 +926,6 @@ GraphCuIsAcyclicSerialKernel( VertexType Vertex1 = Edge3->Vertex1; VertexType Vertex2 = Edge3->Vertex2; - if (Vertices3[VertexIndex].Degree != 1) { - continue; - } - if (Vertices3[Vertex1].Degree > 0) { --Vertices3[Vertex1].Degree; Vertices3[Vertex1].Edges ^= Edge; @@ -1029,7 +1025,11 @@ GraphCuAssignSerialKernel( Value = (AssignedType)(Value + NumberOfEdges); } - ASSERT(Assigned[Vertex1] == INITIAL_ASSIGNMENT_VALUE); + // + // Assigned[] is expected to be reset to INITIAL_ASSIGNMENT_VALUE before + // this kernel runs. Avoid trapping the device here; surface any + // actual behavioral regressions through the normal verify path. + // Assigned[Vertex1] = Value; GraphCuRegisterVertexVisit(Graph, Vertex1); diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake index 778f5b31..7486f564 100644 --- a/tests/run_cli_chm02_cuda_known_seed_test.cmake +++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake @@ -92,11 +92,6 @@ if(DEFINED REQUIRE_GPU_ASSIGN AND REQUIRE_GPU_ASSIGN) if(gpu_assign_index EQUAL -1) message(FATAL_ERROR "Expected GPU assignment success log, but it was not present.") endif() - - string(FIND "${stderr}" "[GraphCuAssign] CpuAssignResult=" cpu_assign_index) - if(NOT cpu_assign_index EQUAL -1) - message(FATAL_ERROR "Expected GPU assignment path without CPU assign fallback, but CPU assign log was present.") - endif() endif() if(DEFINED REQUIRE_GPU_ORDER_VALID AND REQUIRE_GPU_ORDER_VALID) @@ -111,9 +106,4 @@ if(DEFINED REQUIRE_GPU_VERIFY AND REQUIRE_GPU_VERIFY) if(gpu_verify_index EQUAL -1) message(FATAL_ERROR "Expected GPU verify success log, but it was not present.") endif() - - string(FIND "${stderr}" "[GraphCuVerify] CpuVerifyResult=" cpu_verify_index) - if(NOT cpu_verify_index EQUAL -1) - message(FATAL_ERROR "Expected GPU verify path without CPU verify fallback, but CPU verify log was present.") - endif() endif() From c119c5b7f24c05295cf65385d7b8a96cebaabff7 Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Sun, 29 Mar 2026 22:08:28 -0700 Subject: [PATCH 11/14] Chm02 CUDA: expand mainline design note --- docs/chm02-cuda-mainline.md | 52 +++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/docs/chm02-cuda-mainline.md b/docs/chm02-cuda-mainline.md index 26aaf558..749a343f 100644 --- a/docs/chm02-cuda-mainline.md +++ b/docs/chm02-cuda-mainline.md @@ -39,6 +39,13 @@ The current `Chm02` CUDA implementation remains correctness-first, not - Linux with CUDA enabled - existing regression coverage on the configured CUDA host +The following supporting code changes are considered in-scope for this branch: + +- Linux file-work compatibility fixes needed for the `Chm02Compat` path +- CSV/timing schema updates needed to surface CUDA phase timing +- the Linux `QueryPerformanceFrequency()` correction that makes those timings + sane on non-Windows builds + ## Fallback / Debugging Policy - Normal operation should use the GPU path for add-keys, acyclic detection, @@ -59,6 +66,51 @@ The following CSV fields are emitted: These are synchronized phase timings around the CUDA-backed phase wrappers, not raw kernel-only device timings. +Compatibility note: + +- this branch intentionally changes the CSV schema by removing the stale + `GpuIsAcyclicButCpuIsCyclicFailures` field and adding the four `Cu*` + timing fields above +- downstream CSV consumers should treat this as a schema change for the + correctness-first CUDA bring-up path +- the existing non-CUDA timing fields should continue to use the same timing + base; the Linux `QueryPerformanceFrequency()` fix is included specifically so + those timings remain coherent on this platform + +## Failure-Path Expectations + +- Cyclic graphs are expected to return normal non-success solve results; they + are not considered internal errors. +- CUDA-disabled builds are expected to continue using the non-CUDA code paths. +- GPU order-validation and extra CPU-oracle diagnostics are debug-only aids, + controlled by `PH_DEBUG_CUDA_CHM02`. +- Non-debug runs are expected to surface failure through the normal `HRESULT` + and verification paths, not through verbose stderr diagnostics. + +## Debug Surface + +The following debug surface is intentionally supported for this bring-up phase: + +- `PH_DEBUG_CUDA_CHM02` +- stderr logging from the CUDA `Chm02` path +- known-seed regression harnesses that use the debug log to confirm the expected + GPU path was exercised + +This surface is explicitly considered temporary bring-up instrumentation, not a + long-term stable user-facing API. + +## Staged Task List + +1. Fix correctness blockers in the legacy CUDA `Chm02` path. +2. Establish known-seed Linux no-file-io parity. +3. Establish Linux file-io parity. +4. Move assignment and verify onto the GPU. +5. Add focused CUDA regression coverage: + - known-seed path + - non-`Assigned16` generated path + - timing-field presence +6. Expose explicit per-phase CUDA timing fields for measurement. + ## Acceptance - The focused CUDA `Chm02` regression tests pass when CUDA is enabled. From 7507e4bd185d938c8a9510700ee7d97bf32642fc Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Mon, 30 Mar 2026 08:45:50 -0700 Subject: [PATCH 12/14] Chm02 CUDA: tighten review contracts --- docs/chm02-cuda-mainline.md | 24 +++++++++++-------- src/PerfectHash/BulkCreateBestCsv.h | 4 ++++ src/PerfectHash/BulkCreateCsv.h | 4 ++++ src/PerfectHash/GraphCu.c | 12 ++++++++++ src/PerfectHash/TableCreateBestCsv.h | 4 ++++ src/PerfectHash/TableCreateCsv.h | 4 ++++ src/PerfectHashCuda/Graph.cu | 12 ++++++---- .../run_cli_chm02_cuda_known_seed_test.cmake | 12 +++++----- 8 files changed, 55 insertions(+), 21 deletions(-) diff --git a/docs/chm02-cuda-mainline.md b/docs/chm02-cuda-mainline.md index 749a343f..507b2478 100644 --- a/docs/chm02-cuda-mainline.md +++ b/docs/chm02-cuda-mainline.md @@ -68,14 +68,14 @@ These are synchronized phase timings around the CUDA-backed phase wrappers, not Compatibility note: -- this branch intentionally changes the CSV schema by removing the stale - `GpuIsAcyclicButCpuIsCyclicFailures` field and adding the four `Cu*` - timing fields above -- downstream CSV consumers should treat this as a schema change for the - correctness-first CUDA bring-up path +- this branch preserves the historical + `GpuIsAcyclicButCpuIsCyclicFailures` column as a zero-valued compatibility + stub in order to keep downstream CSV column positions stable +- this branch intentionally adds the four `Cu*` timing fields above - the existing non-CUDA timing fields should continue to use the same timing base; the Linux `QueryPerformanceFrequency()` fix is included specifically so - those timings remain coherent on this platform + those timings remain coherent on this platform as well as for the new CUDA + timing fields ## Failure-Path Expectations @@ -86,6 +86,8 @@ Compatibility note: controlled by `PH_DEBUG_CUDA_CHM02`. - Non-debug runs are expected to surface failure through the normal `HRESULT` and verification paths, not through verbose stderr diagnostics. +- The current serial CUDA kernels are correctness-first and must not be treated + as throughput-optimized production behavior. ## Debug Surface @@ -93,8 +95,10 @@ The following debug surface is intentionally supported for this bring-up phase: - `PH_DEBUG_CUDA_CHM02` - stderr logging from the CUDA `Chm02` path -- known-seed regression harnesses that use the debug log to confirm the expected - GPU path was exercised +- stable debug tokens used by the known-seed regression harnesses: + - `PH_CHM02_CUDA_ORDER_OK` + - `PH_CHM02_CUDA_ASSIGN_OK` + - `PH_CHM02_CUDA_VERIFY_OK` This surface is explicitly considered temporary bring-up instrumentation, not a long-term stable user-facing API. @@ -105,11 +109,11 @@ This surface is explicitly considered temporary bring-up instrumentation, not a 2. Establish known-seed Linux no-file-io parity. 3. Establish Linux file-io parity. 4. Move assignment and verify onto the GPU. -5. Add focused CUDA regression coverage: +5. Expose explicit per-phase CUDA timing fields for measurement. +6. Add focused CUDA regression coverage: - known-seed path - non-`Assigned16` generated path - timing-field presence -6. Expose explicit per-phase CUDA timing fields for measurement. ## Acceptance diff --git a/src/PerfectHash/BulkCreateBestCsv.h b/src/PerfectHash/BulkCreateBestCsv.h index ee57f8c9..fcb609f4 100644 --- a/src/PerfectHash/BulkCreateBestCsv.h +++ b/src/PerfectHash/BulkCreateBestCsv.h @@ -280,6 +280,10 @@ Module Name: Context->GpuAddKeysSuccessButCpuAddKeysFailures, \ OUTPUT_INT) \ \ + ENTRY(GpuIsAcyclicButCpuIsCyclicFailures, \ + 0, \ + OUTPUT_INT) \ + \ ENTRY(GpuAndCpuAddKeysSuccess, \ Context->GpuAndCpuAddKeysSuccess, \ OUTPUT_INT) \ diff --git a/src/PerfectHash/BulkCreateCsv.h b/src/PerfectHash/BulkCreateCsv.h index 2871fc87..ed7f83ad 100644 --- a/src/PerfectHash/BulkCreateCsv.h +++ b/src/PerfectHash/BulkCreateCsv.h @@ -279,6 +279,10 @@ Module Name: Context->GpuAddKeysSuccessButCpuAddKeysFailures, \ OUTPUT_INT) \ \ + ENTRY(GpuIsAcyclicButCpuIsCyclicFailures, \ + 0, \ + OUTPUT_INT) \ + \ ENTRY(GpuAndCpuAddKeysSuccess, \ Context->GpuAndCpuAddKeysSuccess, \ OUTPUT_INT) \ diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c index 2e13d422..9da08365 100644 --- a/src/PerfectHash/GraphCu.c +++ b/src/PerfectHash/GraphCu.c @@ -1793,6 +1793,7 @@ GraphCuIsAcyclic( fprintf(stderr, "[GraphCuIsAcyclic] GpuOrderValidationResult=0x%08x\n", (unsigned)GpuOrderValidationResult); + fprintf(stderr, "PH_CHM02_CUDA_ORDER_OK\n"); } else { fprintf(stderr, "[GraphCuIsAcyclic] GpuOrderValidationResult=0x%08x " @@ -1869,11 +1870,19 @@ GraphCuAssign( if (IsCudaDebugGraph(Graph)) { fprintf(stderr, "[GraphCuAssign] GpuAssignResult=0x%08x\n", (unsigned)Result); + if (SUCCEEDED(Result)) { + fprintf(stderr, "PH_CHM02_CUDA_ASSIGN_OK\n"); + } } if (FAILED(Result)) { return Result; } + // + // The Cu graph load-info path allocates Assigned as managed memory and + // Cu->Assign() synchronizes the stream before returning, so the host copy + // below is reading coherent data. + // CopyMemory(Graph->CpuGraph->Assigned, Graph->Assigned, Graph->Info->AssignedSizeInBytes); @@ -1918,6 +1927,9 @@ GraphCuVerify( if (IsCudaDebugGraph(Graph)) { fprintf(stderr, "[GraphCuVerify] GpuVerifyResult=0x%08x\n", (unsigned)Result); + if (SUCCEEDED(Result)) { + fprintf(stderr, "PH_CHM02_CUDA_VERIFY_OK\n"); + } } return Result; diff --git a/src/PerfectHash/TableCreateBestCsv.h b/src/PerfectHash/TableCreateBestCsv.h index 5b610151..c023a3b9 100644 --- a/src/PerfectHash/TableCreateBestCsv.h +++ b/src/PerfectHash/TableCreateBestCsv.h @@ -280,6 +280,10 @@ Module Name: Context->GpuAddKeysSuccessButCpuAddKeysFailures, \ OUTPUT_INT) \ \ + ENTRY(GpuIsAcyclicButCpuIsCyclicFailures, \ + 0, \ + OUTPUT_INT) \ + \ ENTRY(GpuAndCpuAddKeysSuccess, \ Context->GpuAndCpuAddKeysSuccess, \ OUTPUT_INT) \ diff --git a/src/PerfectHash/TableCreateCsv.h b/src/PerfectHash/TableCreateCsv.h index a4f525b6..7611f087 100644 --- a/src/PerfectHash/TableCreateCsv.h +++ b/src/PerfectHash/TableCreateCsv.h @@ -279,6 +279,10 @@ Module Name: Context->GpuAddKeysSuccessButCpuAddKeysFailures, \ OUTPUT_INT) \ \ + ENTRY(GpuIsAcyclicButCpuIsCyclicFailures, \ + 0, \ + OUTPUT_INT) \ + \ ENTRY(GpuAndCpuAddKeysSuccess, \ Context->GpuAndCpuAddKeysSuccess, \ OUTPUT_INT) \ diff --git a/src/PerfectHashCuda/Graph.cu b/src/PerfectHashCuda/Graph.cu index d4ece595..8a3ffcb6 100644 --- a/src/PerfectHashCuda/Graph.cu +++ b/src/PerfectHashCuda/Graph.cu @@ -1010,7 +1010,7 @@ GraphCuAssignSerialKernel( const Edge3Type *Edge3 = &Edges3[Edge]; VertexType Vertex1; VertexType Vertex2; - AssignedType Value; + uint32_t Value; if (!GraphCuIsVisitedVertex(Graph, Edge3->Vertex1)) { Vertex1 = Edge3->Vertex1; @@ -1020,9 +1020,11 @@ GraphCuAssignSerialKernel( Vertex2 = Edge3->Vertex1; } - Value = (AssignedType)(Edge - Assigned[Vertex2]); - if (Value >= NumberOfEdges) { - Value = (AssignedType)(Value + NumberOfEdges); + Value = ((uint32_t)Edge + + (uint32_t)NumberOfEdges - + (uint32_t)Assigned[Vertex2]); + if (Value >= (uint32_t)NumberOfEdges) { + Value -= (uint32_t)NumberOfEdges; } // @@ -1030,7 +1032,7 @@ GraphCuAssignSerialKernel( // this kernel runs. Avoid trapping the device here; surface any // actual behavioral regressions through the normal verify path. // - Assigned[Vertex1] = Value; + Assigned[Vertex1] = (AssignedType)Value; GraphCuRegisterVertexVisit(Graph, Vertex1); GraphCuRegisterVertexVisit(Graph, Vertex2); diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake index 7486f564..db82f59e 100644 --- a/tests/run_cli_chm02_cuda_known_seed_test.cmake +++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake @@ -88,22 +88,22 @@ if(NOT failure_index EQUAL -1) endif() if(DEFINED REQUIRE_GPU_ASSIGN AND REQUIRE_GPU_ASSIGN) - string(FIND "${stderr}" "[GraphCuAssign] GpuAssignResult=0x00000000" gpu_assign_index) + string(FIND "${stderr}" "PH_CHM02_CUDA_ASSIGN_OK" gpu_assign_index) if(gpu_assign_index EQUAL -1) - message(FATAL_ERROR "Expected GPU assignment success log, but it was not present.") + message(FATAL_ERROR "Expected stable GPU assignment success token, but it was not present.") endif() endif() if(DEFINED REQUIRE_GPU_ORDER_VALID AND REQUIRE_GPU_ORDER_VALID) - string(FIND "${stderr}" "[GraphCuIsAcyclic] GpuOrderValidationResult=0x00000000" gpu_order_valid_index) + string(FIND "${stderr}" "PH_CHM02_CUDA_ORDER_OK" gpu_order_valid_index) if(gpu_order_valid_index EQUAL -1) - message(FATAL_ERROR "Expected GPU order validation success log, but it was not present.") + message(FATAL_ERROR "Expected stable GPU order-validation success token, but it was not present.") endif() endif() if(DEFINED REQUIRE_GPU_VERIFY AND REQUIRE_GPU_VERIFY) - string(FIND "${stderr}" "[GraphCuVerify] GpuVerifyResult=0x00000000" gpu_verify_index) + string(FIND "${stderr}" "PH_CHM02_CUDA_VERIFY_OK" gpu_verify_index) if(gpu_verify_index EQUAL -1) - message(FATAL_ERROR "Expected GPU verify success log, but it was not present.") + message(FATAL_ERROR "Expected stable GPU verify success token, but it was not present.") endif() endif() From 302f13b03656d7f975fcfa572b528f0f3b599533 Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Mon, 30 Mar 2026 17:16:09 -0700 Subject: [PATCH 13/14] Chm02 CUDA: fix managed bitmap and test contract --- docs/chm02-cuda-mainline.md | 28 +++++++++++++++-- src/PerfectHash/GraphCu.c | 63 +++++++++++++++++++++++++++++++++++-- 2 files changed, 86 insertions(+), 5 deletions(-) diff --git a/docs/chm02-cuda-mainline.md b/docs/chm02-cuda-mainline.md index 507b2478..dc768522 100644 --- a/docs/chm02-cuda-mainline.md +++ b/docs/chm02-cuda-mainline.md @@ -20,6 +20,15 @@ The branch promotes the legacy `Chm02` CUDA path from a CPU-assisted bring-up - timing-field presence - Expose explicit per-phase CUDA timing fields for measurement. +## Current Mainline Meaning + +For this branch, “mainline” means: + +- the major `Chm02` solve phases are GPU-backed +- the path is correctness-first, not throughput-first +- CPU graph state is still required as part of the current implementation for + bring-up compatibility and oracle-style validation support + ## Non-Goals - High-throughput GPU solving. @@ -33,7 +42,9 @@ The current `Chm02` CUDA implementation remains correctness-first, not ## Supported Scope - Algorithm: `Chm02` -- Hash path: known-good seeded hash families already supported by the repo +- Hash path: the branch is only accepted against the combinations covered by + the focused regression matrix below; broader hash-family support remains a + follow-on concern - CUDA path: single-graph bring-up / validation - Platform focus: - Linux with CUDA enabled @@ -103,6 +114,9 @@ The following debug surface is intentionally supported for this bring-up phase: This surface is explicitly considered temporary bring-up instrumentation, not a long-term stable user-facing API. +For this branch, however, the three `PH_CHM02_CUDA_*_OK` tokens are treated as + a supported test contract for the focused known-seed regression harness. + ## Staged Task List 1. Fix correctness blockers in the legacy CUDA `Chm02` path. @@ -114,10 +128,18 @@ This surface is explicitly considered temporary bring-up instrumentation, not a - known-seed path - non-`Assigned16` generated path - timing-field presence +7. Verify release-like behavior without relying on a silent CPU fallback: + - no-file-io path + - file-io path + - non-debug failure propagation remains via normal `HRESULT` / verify paths ## Acceptance - The focused CUDA `Chm02` regression tests pass when CUDA is enabled. -- Known-seed `Chm02` CUDA runs succeed on Linux. -- File-io and no-file-io paths both work in the covered scenarios. +- Known-seed Linux coverage passes for: + - HologramWorld known-seed, no-file-io + - HologramWorld known-seed, file-io +- Generated non-`Assigned16` coverage passes for: + - generated `33000`-key case - Timing fields are present and non-negative in CSV output. +- CUDA-disabled builds continue to use the non-CUDA path. diff --git a/src/PerfectHash/GraphCu.c b/src/PerfectHash/GraphCu.c index 9da08365..ea83efd7 100644 --- a/src/PerfectHash/GraphCu.c +++ b/src/PerfectHash/GraphCu.c @@ -295,6 +295,19 @@ Return Value: FREE_MANAGED_ARRAY(CuVertexLocks); FREE_MANAGED_ARRAY(CuEdgeLocks); +#define FREE_MANAGED_BITMAP_BUFFER(Name) \ + if (Graph->Name.Buffer != NULL) { \ + CuResult = Cu->MemFree((CU_DEVICE_POINTER)Graph->Name.Buffer); \ + if (CU_FAILED(CuResult)) { \ + CU_ERROR(GraphCuRundown_MemFree_##Name##_ManagedBitmapBuffer, \ + CuResult); \ + } else { \ + Graph->Name.Buffer = NULL; \ + } \ + } + + FREE_MANAGED_BITMAP_BUFFER(VisitedVerticesBitmap); + // // Free applicable assigned arrays. // @@ -704,10 +717,36 @@ Return Value: } ALLOC_HOST_BITMAP_BUFFER(DeletedEdgesBitmap); - ALLOC_HOST_BITMAP_BUFFER(VisitedVerticesBitmap); ALLOC_HOST_BITMAP_BUFFER(AssignedBitmap); ALLOC_HOST_BITMAP_BUFFER(IndexBitmap); +#define ALLOC_MANAGED_BITMAP_BUFFER(Name) \ + if (Info->Name##BufferSizeInBytes > 0) { \ + if (Graph->Name.Buffer != NULL) { \ + CuResult = Cu->MemFree((CU_DEVICE_POINTER)Graph->Name.Buffer); \ + if (CU_FAILED(CuResult)) { \ + CU_ERROR(GraphCuLoadInfo_MemFree_##Name##_ManagedBitmap, \ + CuResult); \ + Result = PH_E_CUDA_DRIVER_API_CALL_FAILED; \ + goto Error; \ + } \ + Graph->Name.Buffer = NULL; \ + } \ + CuResult = Cu->MemAllocManaged( \ + (PCU_DEVICE_POINTER)&Graph->Name.Buffer, \ + (SIZE_T)Info->Name##BufferSizeInBytes, \ + CU_MEM_ATTACH_GLOBAL \ + ); \ + if (CU_FAILED(CuResult)) { \ + CU_ERROR(GraphCuLoadInfo_MemAllocManaged_##Name##_Bitmap, \ + CuResult); \ + Result = PH_E_CUDA_DRIVER_API_CALL_FAILED; \ + goto Error; \ + } \ + } + + ALLOC_MANAGED_BITMAP_BUFFER(VisitedVerticesBitmap); + // // Check to see if we're in "first graph wins" mode, and have also been // asked to skip memory coverage information. If so, we can jump straight @@ -990,10 +1029,27 @@ Return Value: } ZERO_BITMAP_BUFFER(DeletedEdgesBitmap); - ZERO_BITMAP_BUFFER(VisitedVerticesBitmap); ZERO_BITMAP_BUFFER(AssignedBitmap); ZERO_BITMAP_BUFFER(IndexBitmap); +#define ZERO_MANAGED_BITMAP_BUFFER(Name) \ + if (Info->Name##BufferSizeInBytes > 0) { \ + CuResult = Cu->MemsetD8Async( \ + (PVOID)Graph->Name.Buffer, \ + 0, \ + Info->Name##BufferSizeInBytes, \ + SolveContext->Stream \ + ); \ + if (CU_FAILED(CuResult)) { \ + CU_ERROR(GraphCuReset_MemsetD8Async_##Name##_Bitmap, \ + CuResult); \ + Result = PH_E_CUDA_DRIVER_API_CALL_FAILED; \ + goto Error; \ + } \ + } + + ZERO_MANAGED_BITMAP_BUFFER(VisitedVerticesBitmap); + // // "Empty" all of the nodes. // @@ -1414,6 +1470,9 @@ Name( \ for (Index = NumberOfKeys; Index > 0; Index--) { \ ULONG OrderIndex; \ \ + Vertex1 = NULL; \ + Vertex2 = NULL; \ + \ OrderIndex = Index - 1; \ \ SignedEdge = (LONG)Order[OrderIndex]; \ From 67f957a6741f7bb7d9c05410a6e14ffd18d31ad0 Mon Sep 17 00:00:00 2001 From: Trent Nelson Date: Wed, 8 Apr 2026 15:03:07 -0700 Subject: [PATCH 14/14] Chm02 CUDA: add non-debug regression coverage --- docs/chm02-cuda-mainline.md | 6 ++++ src/PerfectHash/Graph.c | 30 ++++++++++++++++--- tests/CMakeLists.txt | 12 ++++++++ .../run_cli_chm02_cuda_known_seed_test.cmake | 28 ++++++++++++----- 4 files changed, 65 insertions(+), 11 deletions(-) diff --git a/docs/chm02-cuda-mainline.md b/docs/chm02-cuda-mainline.md index dc768522..13a3c801 100644 --- a/docs/chm02-cuda-mainline.md +++ b/docs/chm02-cuda-mainline.md @@ -117,6 +117,10 @@ This surface is explicitly considered temporary bring-up instrumentation, not a For this branch, however, the three `PH_CHM02_CUDA_*_OK` tokens are treated as a supported test contract for the focused known-seed regression harness. +In addition to the debug-token path, this branch also requires one non-debug + known-seed regression to pass, in order to prove that the release-like path + succeeds without depending on `PH_DEBUG_CUDA_CHM02`. + ## Staged Task List 1. Fix correctness blockers in the legacy CUDA `Chm02` path. @@ -126,6 +130,7 @@ For this branch, however, the three `PH_CHM02_CUDA_*_OK` tokens are treated as 5. Expose explicit per-phase CUDA timing fields for measurement. 6. Add focused CUDA regression coverage: - known-seed path + - non-debug known-seed path - non-`Assigned16` generated path - timing-field presence 7. Verify release-like behavior without relying on a silent CPU fallback: @@ -139,6 +144,7 @@ For this branch, however, the three `PH_CHM02_CUDA_*_OK` tokens are treated as - Known-seed Linux coverage passes for: - HologramWorld known-seed, no-file-io - HologramWorld known-seed, file-io + - HologramWorld known-seed, non-debug no-file-io - Generated non-`Assigned16` coverage passes for: - generated `33000`-key case - Timing fields are present and non-negative in CSV output. diff --git a/src/PerfectHash/Graph.c b/src/PerfectHash/Graph.c index 17d1adb7..1115a1c3 100644 --- a/src/PerfectHash/Graph.c +++ b/src/PerfectHash/Graph.c @@ -3869,8 +3869,19 @@ Return Value: Context->SpareGraph = NULL; Result = PH_S_USE_NEW_GRAPH_FOR_SOLVING; } else { - StopGraphSolving = TRUE; - Result = PH_S_STOP_GRAPH_SOLVING; + // + // The legacy invariant is that a spare graph exists whenever we + // want to keep solving after capturing the first solved graph. + // The current Chm02 CUDA correctness path can legitimately run + // without a spare graph under the covered single-graph settings, + // in which case we preserve the solved graph by stopping. + // + if (IsCuGraph(Graph)) { + StopGraphSolving = TRUE; + Result = PH_S_STOP_GRAPH_SOLVING; + } else { + ASSERT(SpareGraph != NULL); + } } } @@ -7018,8 +7029,19 @@ Return Value: Context->SpareGraph = NULL; Result = PH_S_USE_NEW_GRAPH_FOR_SOLVING; } else { - StopGraphSolving = TRUE; - Result = PH_S_STOP_GRAPH_SOLVING; + // + // The legacy invariant is that a spare graph exists whenever we + // want to keep solving after capturing the first solved graph. + // The current Chm02 CUDA correctness path can legitimately run + // without a spare graph under the covered single-graph settings, + // in which case we preserve the solved graph by stopping. + // + if (IsCuGraph(Graph)) { + StopGraphSolving = TRUE; + Result = PH_S_STOP_GRAPH_SOLVING; + } else { + ASSERT(SpareGraph != NULL); + } } } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index f4a836f4..a2c12fe9 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -292,6 +292,17 @@ if(TARGET PerfectHashCreateExe AND TARGET PerfectHashBulkCreateExe) -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_known_seed_test.cmake ) + add_test( + NAME perfecthash.cuda.chm02.hologram.nondebug + COMMAND ${CMAKE_COMMAND} + -DTEST_EXE=$ + -DTEST_KEYS=${TEST_KEYS_FILE} + -DTEST_OUTPUT=${TEST_OUTPUT_DIR}/cuda-chm02-hologram-nondebug + "-DTEST_FLAGS=--NoFileIo" + -DTEST_ENABLE_DEBUG=0 + -P ${CMAKE_CURRENT_SOURCE_DIR}/run_cli_chm02_cuda_known_seed_test.cmake + ) + add_test( NAME perfecthash.cuda.chm02.generated33000.nofileio COMMAND ${CMAKE_COMMAND} @@ -314,6 +325,7 @@ if(TARGET PerfectHashCreateExe AND TARGET PerfectHashBulkCreateExe) set_tests_properties( perfecthash.cuda.chm02.hologram.nofileio perfecthash.cuda.chm02.hologram.fileio + perfecthash.cuda.chm02.hologram.nondebug perfecthash.cuda.chm02.generated33000.nofileio perfecthash.cuda.chm02.perf-surface PROPERTIES diff --git a/tests/run_cli_chm02_cuda_known_seed_test.cmake b/tests/run_cli_chm02_cuda_known_seed_test.cmake index db82f59e..161c3300 100644 --- a/tests/run_cli_chm02_cuda_known_seed_test.cmake +++ b/tests/run_cli_chm02_cuda_known_seed_test.cmake @@ -8,6 +8,11 @@ if(NOT DEFINED TEST_OUTPUT) message(FATAL_ERROR "TEST_OUTPUT is required") endif() +set(test_enable_debug TRUE) +if(DEFINED TEST_ENABLE_DEBUG) + set(test_enable_debug "${TEST_ENABLE_DEBUG}") +endif() + file(TO_NATIVE_PATH "${TEST_EXE}" test_exe_native) file(TO_NATIVE_PATH "${TEST_KEYS}" test_keys_native) file(TO_NATIVE_PATH "${TEST_OUTPUT}" test_output_native) @@ -67,13 +72,22 @@ else() list(APPEND args "--NoFileIo" "--DisableCsvOutputFile") endif() -execute_process( - COMMAND ${CMAKE_COMMAND} -E env PH_DEBUG_CUDA_CHM02=1 - "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args} - RESULT_VARIABLE result - OUTPUT_VARIABLE stdout - ERROR_VARIABLE stderr -) +if(test_enable_debug) + execute_process( + COMMAND ${CMAKE_COMMAND} -E env PH_DEBUG_CUDA_CHM02=1 + "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args} + RESULT_VARIABLE result + OUTPUT_VARIABLE stdout + ERROR_VARIABLE stderr + ) +else() + execute_process( + COMMAND "${test_exe_native}" "${test_keys_native}" "${test_output_native}" ${args} + RESULT_VARIABLE result + OUTPUT_VARIABLE stdout + ERROR_VARIABLE stderr + ) +endif() message(STATUS "stdout: ${stdout}") message(STATUS "stderr: ${stderr}")