Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion common/regex-partial.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ std::string regex_to_reversed_partial_regex(const std::string & pattern) {
auto is_star = *it == '*';
++it;
if (is_star) {
if (*it == '?') {
if (it != end && *it == '?') {
++it;
}
}
Expand Down
90 changes: 90 additions & 0 deletions ggml/src/ggml-cpu/simd-gemm.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,96 @@ static void simd_gemm(
C += N;
}
}
#elif defined(GGML_SIMD) && defined(__riscv_v_intrinsic)
// RM accumulators + 1 B vector = RM + 1 <= 8 => RM <= 7
// Microkernel: C[RM x vl] += A[RM x K] * B[K x N]
template <int RM>
static inline void rvv_simd_gemm_ukernel(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int K, int N, size_t vl)
{
static_assert(RM >= 1 && RM <= 7, "RM must be 1..7 for LMUL=4");

vfloat32m4_t acc_0 = __riscv_vle32_v_f32m4(C + 0 * N, vl);
vfloat32m4_t acc_1, acc_2, acc_3, acc_4, acc_5, acc_6;
if constexpr (RM > 1) acc_1 = __riscv_vle32_v_f32m4(C + 1 * N, vl);
if constexpr (RM > 2) acc_2 = __riscv_vle32_v_f32m4(C + 2 * N, vl);
if constexpr (RM > 3) acc_3 = __riscv_vle32_v_f32m4(C + 3 * N, vl);
if constexpr (RM > 4) acc_4 = __riscv_vle32_v_f32m4(C + 4 * N, vl);
if constexpr (RM > 5) acc_5 = __riscv_vle32_v_f32m4(C + 5 * N, vl);
if constexpr (RM > 6) acc_6 = __riscv_vle32_v_f32m4(C + 6 * N, vl);

for (int kk = 0; kk < K; kk++) {
vfloat32m4_t b_0 = __riscv_vle32_v_f32m4(B + kk * N, vl);

acc_0 = __riscv_vfmacc_vf_f32m4(acc_0, A[0 * K + kk], b_0, vl);
if constexpr (RM > 1) acc_1 = __riscv_vfmacc_vf_f32m4(acc_1, A[1 * K + kk], b_0, vl);
if constexpr (RM > 2) acc_2 = __riscv_vfmacc_vf_f32m4(acc_2, A[2 * K + kk], b_0, vl);
if constexpr (RM > 3) acc_3 = __riscv_vfmacc_vf_f32m4(acc_3, A[3 * K + kk], b_0, vl);
if constexpr (RM > 4) acc_4 = __riscv_vfmacc_vf_f32m4(acc_4, A[4 * K + kk], b_0, vl);
if constexpr (RM > 5) acc_5 = __riscv_vfmacc_vf_f32m4(acc_5, A[5 * K + kk], b_0, vl);
if constexpr (RM > 6) acc_6 = __riscv_vfmacc_vf_f32m4(acc_6, A[6 * K + kk], b_0, vl);
}

__riscv_vse32_v_f32m4(C + 0 * N, acc_0, vl);
if constexpr (RM > 1) __riscv_vse32_v_f32m4(C + 1 * N, acc_1, vl);
if constexpr (RM > 2) __riscv_vse32_v_f32m4(C + 2 * N, acc_2, vl);
if constexpr (RM > 3) __riscv_vse32_v_f32m4(C + 3 * N, acc_3, vl);
if constexpr (RM > 4) __riscv_vse32_v_f32m4(C + 4 * N, acc_4, vl);
if constexpr (RM > 5) __riscv_vse32_v_f32m4(C + 5 * N, acc_5, vl);
if constexpr (RM > 6) __riscv_vse32_v_f32m4(C + 6 * N, acc_6, vl);
}

template <int RM>
static inline void rvv_simd_gemm_dispatch_tail(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int K, int N, int KN, int remaining_rows)
{
if constexpr (RM > 0) {
if (remaining_rows == RM) {
int64_t jj = 0;
for (; jj + KN <= N; jj += KN) {
rvv_simd_gemm_ukernel<RM>(C + jj, A, B + jj, K, N, KN);
}
if (jj < N) {
rvv_simd_gemm_ukernel<RM>(C + jj, A, B + jj, K, N, N - jj);
}
} else {
rvv_simd_gemm_dispatch_tail<RM - 1>(C, A, B, K, N, KN, remaining_rows);
}
}
}

static constexpr int GEMM_RM = 7;

// C[M x N] += A[M x K] * B[K x N]
static void simd_gemm(
float * GGML_RESTRICT C,
const float * GGML_RESTRICT A,
const float * GGML_RESTRICT B,
int M, int K, int N)
{
const int KN = (int)__riscv_vlenb();
int64_t ii = 0;
for (; ii + GEMM_RM <= M; ii += GEMM_RM) {
int64_t jj = 0;
for (; jj + KN <= N; jj += KN) {
rvv_simd_gemm_ukernel<GEMM_RM>(C + jj, A, B + jj, K, N, KN);
}
if (jj < N) {
rvv_simd_gemm_ukernel<GEMM_RM>(C + jj, A, B + jj, K, N, N - jj);
}
A += GEMM_RM * K;
C += GEMM_RM * N;
}

int remaining_rows = M - ii;
rvv_simd_gemm_dispatch_tail<GEMM_RM - 1>(C, A, B, K, N, KN, remaining_rows);
}

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
Expand Down
32 changes: 21 additions & 11 deletions ggml/src/ggml-cuda/gated_delta_net.cu
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#include "gated_delta_net.cuh"

template <int S_v, bool KDA>
__global__ void gated_delta_net_cuda(const float * q,
__global__ void __launch_bounds__((ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v) * 4, 2)
gated_delta_net_cuda(const float * q,
const float * k,
const float * v,
const float * g,
Expand Down Expand Up @@ -38,18 +39,19 @@ __global__ void gated_delta_net_cuda(const float * q,

const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v;
state += state_offset;
curr_state += state_offset;
curr_state += state_offset + col * S_v;
attn_data += (sequence * n_tokens * H + h_idx) * S_v;

constexpr int warp_size = ggml_cuda_get_physical_warp_size() < S_v ? ggml_cuda_get_physical_warp_size() : S_v;
static_assert(S_v % warp_size == 0, "S_v must be a multiple of warp_size");
constexpr int rows_per_lane = (S_v + warp_size - 1) / warp_size;
float s_shard[rows_per_lane];
// state is stored transposed: M[col][i] = S[i][col], row col is contiguous

#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
s_shard[r] = curr_state[col * S_v + i];
s_shard[r] = curr_state[i];
}

for (int t = 0; t < n_tokens; t++) {
Expand All @@ -63,15 +65,24 @@ __global__ void gated_delta_net_cuda(const float * q,

const float beta_val = *beta_t;

// Cache k and q in registers
float k_reg[rows_per_lane];
float q_reg[rows_per_lane];
#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
k_reg[r] = k_t[i];
q_reg[r] = q_t[i];
}

if constexpr (!KDA) {
const float g_val = expf(*g_t);

// kv[col] = (S^T @ k)[col] = sum_i S[i][col] * k[i]
float kv_shard = 0.0f;
#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
kv_shard += s_shard[r] * k_t[i];
kv_shard += s_shard[r] * k_reg[r];
}
float kv_col = warp_reduce_sum<warp_size>(kv_shard);

Expand All @@ -83,9 +94,8 @@ __global__ void gated_delta_net_cuda(const float * q,
float attn_partial = 0.0f;
#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
s_shard[r] = g_val * s_shard[r] + k_t[i] * delta_col;
attn_partial += s_shard[r] * q_t[i];
s_shard[r] = g_val * s_shard[r] + k_reg[r] * delta_col;
attn_partial += s_shard[r] * q_reg[r];
}

float attn_col = warp_reduce_sum<warp_size>(attn_partial);
Expand All @@ -99,7 +109,7 @@ __global__ void gated_delta_net_cuda(const float * q,
#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
kv_shard += expf(g_t[i]) * s_shard[r] * k_t[i];
kv_shard += expf(g_t[i]) * s_shard[r] * k_reg[r];
}

float kv_col = warp_reduce_sum<warp_size>(kv_shard);
Expand All @@ -113,8 +123,8 @@ __global__ void gated_delta_net_cuda(const float * q,
#pragma unroll
for (int r = 0; r < rows_per_lane; r++) {
const int i = r * warp_size + lane;
s_shard[r] = expf(g_t[i]) * s_shard[r] + k_t[i] * delta_col;
attn_partial += s_shard[r] * q_t[i];
s_shard[r] = expf(g_t[i]) * s_shard[r] + k_reg[r] * delta_col;
attn_partial += s_shard[r] * q_reg[r];
}

float attn_col = warp_reduce_sum<warp_size>(attn_partial);
Expand Down
Loading