diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 19d225a4837..34184ed8510 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -181,11 +181,11 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi const int8x16_t v_yh = vec_xl(QK8_0/2, y[ib].qs); const int16x8_t v_xylso = vec_mulo(v_xls, v_yl); - const int16x8_t v_xylse = vec_mule(v_xls, v_yl); + const int16x8_t v_xyl = vec_meadd(v_xls, v_yl, v_xylso); const int16x8_t v_xyhso = vec_mulo(v_xhs, v_yh); - const int16x8_t v_xyhse = vec_mule(v_xhs, v_yh); + const int16x8_t v_xyh = vec_meadd(v_xhs, v_yh, v_xyhso); - int16x8_t v_xy_ = v_xylso + v_xylse + v_xyhso + v_xyhse; v_xy_ += vec_reve(v_xy_); + int16x8_t v_xy_ = v_xyl + v_xyh; v_xy_ += vec_reve(v_xy_); const float32x4_t v_xy = vec_float(vec_unpackh(v_xy_)); const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x[ib].d) * GGML_CPU_FP16_TO_FP32(y[ib].d)); @@ -890,8 +890,7 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const int16x8_t v_minsh = (int16x8_t)vec_unpackh((uint8x16_t)v_mins8); const int32x4_t v_minso = vec_mulo(v_ysums, v_minsh); - const int32x4_t v_minse = vec_mule(v_ysums, v_minsh); - const int32x4_t v_mins = v_minso + v_minse; + const int32x4_t v_mins = vec_meadd(v_ysums, v_minsh, v_minso); sumf -= dmin * (v_mins[0] + v_mins[1] + v_mins[2] + v_mins[3]); const uint8_t * scales = (const uint8_t *)utmp; @@ -1004,8 +1003,7 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const int16x8_t v_minsh = (int16x8_t)vec_unpackh(v_mins8); const int32x4_t v_minsho = vec_mulo(v_ysums, v_minsh); - const int32x4_t v_minshe = vec_mule(v_ysums, v_minsh); - const int32x4_t v_mins = vec_add(v_minsho, v_minshe); + const int32x4_t v_mins = vec_meadd(v_ysums, v_minsh, v_minsho); const int32_t mins = vec_hsum_i32x4(v_mins); const uint8_t * scales = (const uint8_t *)utmp; @@ -1110,10 +1108,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi const int16x8_t v_scaleh = vec_unpackl(v_scale); const int32x4_t v_minslo = vec_mulo(v_ysumsl, v_scalel); - const int32x4_t v_minsle = vec_mule(v_ysumsl, v_scalel); + const int32x4_t v_minsl = vec_meadd(v_ysumsl, v_scalel, v_minslo); const int32x4_t v_minsho = vec_mulo(v_ysumsh, v_scaleh); - const int32x4_t v_minshe = vec_mule(v_ysumsh, v_scaleh); - const int32x4_t v_mins = v_minslo + v_minsle + v_minsho + v_minshe; + const int32x4_t v_minsh = vec_meadd(v_ysumsh, v_scaleh, v_minsho); + const int32x4_t v_mins = vec_add(v_minsl, v_minsh); const int32_t mins = vec_hsum_i32x4(v_mins); diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 0fae68628b6..23d6d39e0e8 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -590,6 +590,7 @@ struct vk_device_struct { vk_queue transfer_queue; bool single_queue; bool support_async; + bool async_use_transfer_queue; uint32_t subgroup_size; uint32_t subgroup_size_log2; uint32_t shader_core_count; @@ -1858,6 +1859,10 @@ struct ggml_backend_vk_context { vk_context_ref compute_ctx; + vk_context_ref transfer_ctx; + vk_semaphore transfer_semaphore; + uint64_t transfer_semaphore_last_submitted {}; + std::vector tensor_ctxs; std::vector descriptor_pools; @@ -1866,6 +1871,7 @@ struct ggml_backend_vk_context { uint32_t pipeline_descriptor_set_requirements {}; vk_command_pool compute_cmd_pool; + vk_command_pool transfer_cmd_pool; // number 
of additional consecutive nodes that are being fused with the // node currently being processed @@ -5391,13 +5397,19 @@ static vk_device ggml_vk_get_device(size_t idx) { ggml_vk_load_shaders(device); + const bool prefers_transfer_queue = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture != AMD_GCN; + if (!device->single_queue) { const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0; ggml_vk_create_queue(device, device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }, true); + + device->async_use_transfer_queue = prefers_transfer_queue || (getenv("GGML_VK_ASYNC_USE_TRANSFER_QUEUE") != nullptr); } else { // TODO: Use pointer or reference to avoid copy device->transfer_queue.copyFrom(device->compute_queue); device->transfer_queue.cmd_pool.init(device, &device->transfer_queue); + + device->async_use_transfer_queue = false; } device->buffer_type = { @@ -5871,6 +5883,15 @@ static void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { ctx->almost_ready_fence = ctx->device->device.createFence({}); ctx->compute_cmd_pool.init(ctx->device, &ctx->device->compute_queue); + if (ctx->device->async_use_transfer_queue) { + vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; + vk::SemaphoreCreateInfo ci{}; + ci.setPNext(&tci); + ctx->transfer_semaphore.s = ctx->device->device.createSemaphore(ci); + ctx->transfer_semaphore.value = 0; + + ctx->transfer_cmd_pool.init(ctx->device, &ctx->device->transfer_queue); + } if (vk_perf_logger_enabled) { ctx->perf_logger = std::unique_ptr(new vk_perf_logger()); @@ -6419,6 +6440,47 @@ static void ggml_vk_ctx_begin(vk_device& device, vk_context& subctx) { subctx->s = subctx->seqs[subctx->seqs.size() - 1].data(); } +static vk_context ggml_vk_get_compute_ctx(ggml_backend_vk_context * ctx) { + if (!ctx->compute_ctx.expired()) { + return ctx->compute_ctx.lock(); + } + + vk_context result = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); + + ctx->compute_ctx = result; + ggml_vk_ctx_begin(ctx->device, result); + + if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) { + result->s->wait_semaphores.push_back(ctx->transfer_semaphore); + ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value; + } + + return result; +} + +// Submit any pending transfer queue work and signal the transfer semaphore. +// The next compute context created via ggml_vk_get_compute_ctx will wait on this semaphore. +// Returns true if work was submitted. 
+static bool ggml_vk_submit_transfer_ctx(ggml_backend_vk_context * ctx) { + if (!ctx->device->async_use_transfer_queue || ctx->transfer_ctx.expired()) { + return false; + } + + vk_context cpy_ctx = ctx->transfer_ctx.lock(); + ggml_vk_ctx_end(cpy_ctx); + + for (auto& cpy : cpy_ctx->in_memcpys) { + memcpy(cpy.dst, cpy.src, cpy.n); + } + + ctx->transfer_semaphore.value++; + cpy_ctx->seqs.back().back().signal_semaphores.push_back(ctx->transfer_semaphore); + + ggml_vk_submit(cpy_ctx, {}); + ctx->transfer_ctx.reset(); + return true; +} + static size_t ggml_vk_align_size(size_t width, size_t align) { VK_LOG_DEBUG("ggml_vk_align_size(" << width << ", " << align << ")"); return CEIL_DIV(width, align) * align; @@ -7512,6 +7574,18 @@ static bool ggml_vk_should_use_mmvq(const vk_device& device, uint32_t m, uint32_ return false; } + if (device->driver_id == vk::DriverId::eIntelProprietaryWindows) { + // Intel Windows proprietary driver tuning + switch (src0_type) { + case GGML_TYPE_MXFP4: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + return false; + default: + return true; + } + } + switch (src0_type) { // From tests on A770 Linux, may need more tuning case GGML_TYPE_Q4_0: @@ -12529,15 +12603,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr } } - vk_context compute_ctx; - - if (ctx->compute_ctx.expired()) { - compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - ctx->compute_ctx = compute_ctx; - ggml_vk_ctx_begin(ctx->device, compute_ctx); - } else { - compute_ctx = ctx->compute_ctx.lock(); - } + vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); { // This logic detects dependencies between modes in the graph and calls ggml_vk_sync_buffers @@ -13055,6 +13121,9 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false; ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool); + if (ctx->device->async_use_transfer_queue) { + ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool); + } for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) { ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s }); @@ -13116,6 +13185,11 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ctx->descriptor_sets.clear(); ctx->compute_cmd_pool.destroy(ctx->device->device); + if (ctx->device->async_use_transfer_queue) { + ctx->device->device.destroySemaphore(ctx->transfer_semaphore.s); + + ctx->transfer_cmd_pool.destroy(ctx->device->device); + } if (vk_perf_logger_enabled) { ctx->perf_logger->print_timings(true); } @@ -13387,34 +13461,38 @@ static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, ggml_tensor ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; - vk_context compute_ctx; + vk_context cpy_ctx; - if (ctx->compute_ctx.expired()) { - // Initialize new transfer context - compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - ctx->compute_ctx = compute_ctx; - ggml_vk_ctx_begin(ctx->device, compute_ctx); + if (ctx->device->async_use_transfer_queue) { + if (ctx->transfer_ctx.expired()) { + // Initialize new transfer context + cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool); + ctx->transfer_ctx = cpy_ctx; + ggml_vk_ctx_begin(ctx->device, cpy_ctx); + } else { + cpy_ctx = ctx->transfer_ctx.lock(); + } } else { - compute_ctx = ctx->compute_ctx.lock(); + cpy_ctx = ggml_vk_get_compute_ctx(ctx); } vk_buffer buf = buf_ctx->dev_buffer; auto 
dst_offset = vk_tensor_offset(tensor) + tensor->view_offs + offset; - bool ret = ggml_vk_buffer_write_async(compute_ctx, buf, dst_offset, data, size); + bool ret = ggml_vk_buffer_write_async(cpy_ctx, buf, dst_offset, data, size); if (!ret) { ggml_vk_ensure_sync_staging_buffer(ctx, size); - ggml_vk_sync_buffers(nullptr, compute_ctx); + ggml_vk_sync_buffers(nullptr, cpy_ctx); vk::BufferCopy buffer_cpy; buffer_cpy.srcOffset = 0; buffer_cpy.dstOffset = dst_offset; buffer_cpy.size = size; - compute_ctx->s->buffer.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy }); - deferred_memcpy(ctx->sync_staging->ptr, data, size, &compute_ctx->in_memcpys); + cpy_ctx->s->buffer.copyBuffer(ctx->sync_staging->buffer, buf->buffer, { buffer_cpy }); + deferred_memcpy(ctx->sync_staging->ptr, data, size, &cpy_ctx->in_memcpys); ggml_vk_synchronize(ctx); } } @@ -13426,16 +13504,7 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_ ggml_backend_vk_buffer_context * buf_ctx = (ggml_backend_vk_buffer_context *)tensor->buffer->context; - vk_context compute_ctx; - - if (ctx->compute_ctx.expired()) { - // Initialize new transfer context - compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - ctx->compute_ctx = compute_ctx; - ggml_vk_ctx_begin(ctx->device, compute_ctx); - } else { - compute_ctx = ctx->compute_ctx.lock(); - } + vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); vk_buffer buf = buf_ctx->dev_buffer; @@ -13458,31 +13527,60 @@ static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, const ggml_ } } -static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { +static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { VK_LOG_DEBUG("ggml_backend_vk_cpy_tensor_async()"); - ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; - if ((dst->buffer->buft == ggml_backend_vk_get_default_buffer_type(backend) || dst->buffer->buft == ggml_backend_vk_host_buffer_type()) && ggml_backend_buffer_is_vk(src->buffer)) { - ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context; - ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend_dst->context; + + if (dst->buffer->buft != ggml_backend_vk_get_default_buffer_type(backend_dst)) { + return false; + } - vk_context compute_ctx; + ggml_backend_vk_buffer_context * dst_buf_ctx = (ggml_backend_vk_buffer_context *)dst->buffer->context; + vk_buffer dst_buf = dst_buf_ctx->dev_buffer; - if (ctx->compute_ctx.expired()) { - // Initialize new transfer context - compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - ctx->compute_ctx = compute_ctx; - ggml_vk_ctx_begin(ctx->device, compute_ctx); - } else { - compute_ctx = ctx->compute_ctx.lock(); + if (ggml_backend_buffer_is_vk(src->buffer)) { + ggml_backend_vk_buffer_context * src_buf_ctx = (ggml_backend_vk_buffer_context *)src->buffer->context; + + // Async copy only works within the same device + if (src_buf_ctx->dev_buffer->device != dst_buf->device) { + return false; } - vk_buffer src_buf = src_buf_ctx->dev_buffer; - vk_buffer dst_buf = dst_buf_ctx->dev_buffer; + vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); - ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, src_buf, 
vk_tensor_offset(src) + src->view_offs, ggml_nbytes(src)); + ggml_vk_buffer_copy_async(compute_ctx, dst_buf, vk_tensor_offset(dst) + dst->view_offs, + src_buf_ctx->dev_buffer, vk_tensor_offset(src) + src->view_offs, + ggml_nbytes(src)); return true; } + if (ggml_backend_buffer_is_host(src->buffer)) { + vk_buffer pinned_buf = nullptr; + size_t pinned_offset = 0; + ggml_vk_host_get(ctx->device, src->data, pinned_buf, pinned_offset); + if (pinned_buf == nullptr) { + return false; + } + + vk_context cpy_ctx; + if (ctx->device->async_use_transfer_queue) { + if (ctx->transfer_ctx.expired()) { + cpy_ctx = ggml_vk_create_context(ctx, ctx->transfer_cmd_pool); + ctx->transfer_ctx = cpy_ctx; + ggml_vk_ctx_begin(ctx->device, cpy_ctx); + } else { + cpy_ctx = ctx->transfer_ctx.lock(); + } + } else { + cpy_ctx = ggml_vk_get_compute_ctx(ctx); + } + + return ggml_vk_buffer_write_async(cpy_ctx, dst_buf, + vk_tensor_offset(dst) + dst->view_offs, + src->data, ggml_nbytes(src)); + } + + GGML_UNUSED(backend_src); return false; } @@ -13491,6 +13589,10 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) { bool do_transfer = !ctx->compute_ctx.expired(); + if (ggml_vk_submit_transfer_ctx(ctx)) { + ctx->submit_pending = true; + } + vk_context compute_ctx; if (do_transfer) { compute_ctx = ctx->compute_ctx.lock(); @@ -13506,7 +13608,22 @@ static void ggml_vk_synchronize(ggml_backend_vk_context * ctx) { } if (ctx->submit_pending) { - { + if (ctx->device->async_use_transfer_queue && ctx->transfer_semaphore_last_submitted < ctx->transfer_semaphore.value) { + vk::TimelineSemaphoreSubmitInfo tl_info{ + 1, &ctx->transfer_semaphore.value, + 0, nullptr, + }; + vk::PipelineStageFlags stage = ctx->device->transfer_queue.stage_flags; + vk::SubmitInfo si{ + 1, &ctx->transfer_semaphore.s, &stage, + 0, nullptr, + 0, nullptr, + }; + si.setPNext(&tl_info); + std::lock_guard guard(queue_mutex); + ctx->device->compute_queue.queue.submit({ si }, ctx->fence); + ctx->transfer_semaphore_last_submitted = ctx->transfer_semaphore.value; + } else { std::lock_guard guard(queue_mutex); ctx->device->compute_queue.queue.submit({}, ctx->fence); } @@ -13972,6 +14089,8 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg bool first_node_in_batch = true; // true if next node will be first node in a batch int submit_node_idx = 0; // index to first node in a batch + ggml_vk_submit_transfer_ctx(ctx); + vk_context compute_ctx; if (vk_perf_logger_enabled) { // allocate/resize the query pool @@ -13997,9 +14116,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg std::fill(ctx->query_node_idx.begin(), ctx->query_node_idx.end(), 0); GGML_ASSERT(ctx->compute_ctx.expired()); - compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - ctx->compute_ctx = compute_ctx; - ggml_vk_ctx_begin(ctx->device, compute_ctx); + compute_ctx = ggml_vk_get_compute_ctx(ctx); ctx->query_idx = 0; compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->query_pool, ctx->query_idx++); } @@ -14009,13 +14126,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg if (ctx->prealloc_size_add_rms_partials) { ggml_vk_preallocate_buffers(ctx, nullptr); - if (ctx->compute_ctx.expired()) { - compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - ctx->compute_ctx = compute_ctx; - ggml_vk_ctx_begin(ctx->device, compute_ctx); - } else { - compute_ctx = ctx->compute_ctx.lock(); - } + compute_ctx = ggml_vk_get_compute_ctx(ctx); // initialize 
partial sums to zero. ggml_vk_buffer_memset_async(compute_ctx, ctx->prealloc_add_rms_partials, 0, 0, ctx->prealloc_size_add_rms_partials); ggml_vk_sync_buffers(ctx, compute_ctx); @@ -14238,13 +14349,7 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg bool enqueued = ggml_vk_build_graph(ctx, cgraph, i, cgraph->nodes[submit_node_idx], submit_node_idx, i + ctx->num_additional_fused_ops >= last_node, almost_ready, submit); if (vk_perf_logger_enabled && enqueued) { - if (ctx->compute_ctx.expired()) { - compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - ctx->compute_ctx = compute_ctx; - ggml_vk_ctx_begin(ctx->device, compute_ctx); - } else { - compute_ctx = ctx->compute_ctx.lock(); - } + compute_ctx = ggml_vk_get_compute_ctx(ctx); if (!vk_perf_logger_concurrent) { // track a single node/fusion for the current query ctx->query_nodes[ctx->query_idx] = cgraph->nodes[i]; @@ -14579,16 +14684,9 @@ static void ggml_backend_vk_event_record(ggml_backend_t backend, ggml_backend_ev ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; vk_event *vkev = (vk_event *)event->context; - vk_context compute_ctx; + ggml_vk_submit_transfer_ctx(ctx); - if (ctx->compute_ctx.expired()) { - // Initialize new transfer context - compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - ctx->compute_ctx = compute_ctx; - ggml_vk_ctx_begin(ctx->device, compute_ctx); - } else { - compute_ctx = ctx->compute_ctx.lock(); - } + vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); // the backend interface doesn't have an explicit reset, so reset it here // before we record the command to set it @@ -14609,16 +14707,7 @@ static void ggml_backend_vk_event_wait(ggml_backend_t backend, ggml_backend_even ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context; vk_event *vkev = (vk_event *)event->context; - vk_context compute_ctx; - - if (ctx->compute_ctx.expired()) { - // Initialize new transfer context - compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool); - ctx->compute_ctx = compute_ctx; - ggml_vk_ctx_begin(ctx->device, compute_ctx); - } else { - compute_ctx = ctx->compute_ctx.lock(); - } + vk_context compute_ctx = ggml_vk_get_compute_ctx(ctx); ggml_vk_wait_events(compute_ctx, {vkev->event}); ggml_vk_ctx_end(compute_ctx); @@ -14631,7 +14720,7 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .free = */ ggml_backend_vk_free, /* .set_tensor_async = */ ggml_backend_vk_set_tensor_async, /* .get_tensor_async = */ ggml_backend_vk_get_tensor_async, - /* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async, + /* .cpy_tensor_async = */ ggml_backend_vk_cpy_tensor_async, /* .synchronize = */ ggml_backend_vk_synchronize, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, @@ -15367,11 +15456,25 @@ static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_ba return buft_ctx->device->idx == ctx->device; } +static int64_t ggml_vk_get_op_batch_size(const ggml_tensor * op) { + switch (op->op) { + case GGML_OP_GET_ROWS: + return 0; + case GGML_OP_MUL_MAT: + return op->ne[1]; + case GGML_OP_MUL_MAT_ID: + case GGML_OP_ROPE: + case GGML_OP_ROPE_BACK: + return op->ne[2]; + default: + return ggml_nrows(op); + } +} + static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { ggml_backend_vk_device_context * dev_ctx = (ggml_backend_vk_device_context *)dev->context; - return (op->ne[1] >= dev_ctx->op_offload_min_batch_size && op->op != GGML_OP_GET_ROWS) 
|| - (op->ne[2] >= dev_ctx->op_offload_min_batch_size && op->op == GGML_OP_MUL_MAT_ID); + return ggml_vk_get_op_batch_size(op) >= dev_ctx->op_offload_min_batch_size; } static ggml_backend_event_t ggml_backend_vk_device_event_new(ggml_backend_dev_t dev) { diff --git a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp index 0d5a818dacb..369475eaf50 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu-shader-lib.hpp @@ -68,6 +68,7 @@ struct ggml_webgpu_shader_lib_context { size_t wg_mem_limit_bytes = 0; bool inplace = false; bool overlap = false; + bool src_overlap = false; bool supports_subgroup_matrix = false; uint32_t sg_mat_m = 0; uint32_t sg_mat_n = 0; @@ -179,9 +180,10 @@ struct ggml_webgpu_binary_pipeline_key { int op; bool inplace; bool overlap; + bool src_overlap; bool operator==(const ggml_webgpu_binary_pipeline_key & other) const { - return type == other.type && op == other.op && inplace == other.inplace && overlap == other.overlap; + return type == other.type && op == other.op && inplace == other.inplace && overlap == other.overlap && src_overlap == other.src_overlap; } }; @@ -192,6 +194,7 @@ struct ggml_webgpu_binary_pipeline_key_hash { ggml_webgpu_hash_combine(seed, key.op); ggml_webgpu_hash_combine(seed, key.inplace); ggml_webgpu_hash_combine(seed, key.overlap); + ggml_webgpu_hash_combine(seed, key.src_overlap); return seed; } }; @@ -1044,6 +1047,7 @@ class ggml_webgpu_shader_lib { .op = context.dst->op, .inplace = context.inplace, .overlap = context.overlap, + .src_overlap = context.src_overlap, }; auto it = binary_pipelines.find(key); @@ -1076,6 +1080,9 @@ class ggml_webgpu_shader_lib { } else if (key.overlap) { defines.push_back("OVERLAP"); variant += "_overlap"; + } else if (key.src_overlap) { + defines.push_back("SRC_OVERLAP"); + variant += "_src_overlap"; } defines.push_back(std::string("WG_SIZE=") + std::to_string(context.max_wg_size)); diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index 1c00d3cb2b1..913cf7f8825 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -133,12 +133,28 @@ struct webgpu_buf_pool { // which can run on a different thread than the calling thread. 
std::mutex mutex; std::condition_variable cv; + size_t cur_pool_size; + size_t max_pool_size; + wgpu::Device device; + wgpu::BufferUsage host_buf_usage; + wgpu::BufferUsage dev_buf_usage; + size_t buf_size; + bool should_grow; void init(wgpu::Device device, int num_bufs, size_t buf_size, wgpu::BufferUsage dev_buf_usage, - wgpu::BufferUsage host_buf_usage) { + wgpu::BufferUsage host_buf_usage, + bool should_grow = false, + size_t max_pool_size = WEBGPU_NUM_PARAM_BUFS * 2) { + this->max_pool_size = max_pool_size; + this->cur_pool_size = num_bufs; + this->device = device; + this->host_buf_usage = host_buf_usage; + this->dev_buf_usage = dev_buf_usage; + this->buf_size = buf_size; + this->should_grow = should_grow; for (int i = 0; i < num_bufs; i++) { wgpu::Buffer host_buf; wgpu::Buffer dev_buf; @@ -150,6 +166,25 @@ struct webgpu_buf_pool { webgpu_pool_bufs alloc_bufs() { std::unique_lock lock(mutex); + if (!free.empty()) { + webgpu_pool_bufs bufs = free.back(); + free.pop_back(); + return bufs; + } + + // Try growing the pool if no free buffers + if (free.empty() && cur_pool_size < max_pool_size && should_grow) { + cur_pool_size++; + wgpu::Buffer host_buf; + wgpu::Buffer dev_buf; + ggml_webgpu_create_buffer(device, host_buf, buf_size, host_buf_usage, "ggml_webgpu_host_pool_buf"); + ggml_webgpu_create_buffer(device, dev_buf, buf_size, dev_buf_usage, "ggml_webgpu_dev_pool_buf"); + + if (!(host_buf && dev_buf)) { + GGML_ABORT("webgpu_buf_pool: failed to allocate buffers"); + } + return webgpu_pool_bufs{ host_buf, dev_buf }; + } cv.wait(lock, [this] { return !free.empty(); }); webgpu_pool_bufs bufs = free.back(); free.pop_back(); @@ -243,6 +278,7 @@ struct webgpu_gpu_profile_buf_pool { #endif struct webgpu_command { + uint32_t num_kernels; wgpu::CommandBuffer commands; std::vector params_bufs; std::optional set_rows_error_bufs; @@ -280,7 +316,6 @@ struct webgpu_global_context_struct { webgpu_buf_pool memset_buf_pool; std::map memset_pipelines; // variant or type index - std::atomic_uint inflight_threads = 0; #ifdef GGML_WEBGPU_CPU_PROFILE // Profiling: labeled CPU time in ms (total) @@ -426,13 +461,9 @@ static void ggml_webgpu_create_buffer(wgpu::Device & device, static void ggml_backend_webgpu_wait(webgpu_global_context & ctx, std::vector & futures, bool block = true) { - // If we have too many in-flight submissions, wait on the oldest one first. If - // there are many threads, inflight_max may be 0, meaning that we must wait on - // all futures. - uint64_t timeout_ms = block ? UINT64_MAX : 0; - uint32_t inflight_threads = ctx->inflight_threads; - uint32_t inflight_max = WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD / std::max(inflight_threads, 1u); - while (futures.size() >= inflight_max && futures.size() > 0) { + // If we have too many in-flight submissions, wait on the oldest one first. + uint64_t timeout_ms = block ? 
UINT64_MAX : 0; + while (futures.size() >= WEBGPU_MAX_INFLIGHT_SUBS_PER_THREAD) { ctx->instance.WaitAny(futures[0].futures.size(), futures[0].futures.data(), UINT64_MAX); futures.erase(futures.begin()); } @@ -651,6 +682,7 @@ static webgpu_command ggml_backend_webgpu_build_multi( result.commands = commands; result.params_bufs = params_bufs_list; result.set_rows_error_bufs = set_rows_error_bufs; + result.num_kernels = pipelines.size(); #ifdef GGML_WEBGPU_GPU_PROFILE result.timestamp_query_bufs = ts_bufs; // TODO: handle multiple pipeline names @@ -788,6 +820,7 @@ static bool ggml_webgpu_tensor_overlap(ggml_tensor * a, ggml_tensor * b) { struct binary_overlap_flags { bool inplace; // src0 == dst bool overlap; // src1 == dst + bool src_overlap; }; static binary_overlap_flags ggml_webgpu_detect_binary_overlap(ggml_tensor * src0, @@ -796,6 +829,7 @@ static binary_overlap_flags ggml_webgpu_detect_binary_overlap(ggml_tensor * src0 binary_overlap_flags flags = {}; flags.inplace = ggml_webgpu_tensor_equal(src0, dst); flags.overlap = ggml_webgpu_tensor_overlap(src1, dst); + flags.src_overlap = ggml_webgpu_tensor_overlap(src0, src1); return flags; } @@ -1353,6 +1387,7 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, .max_wg_size = ctx->global_ctx->capabilities.limits.maxComputeInvocationsPerWorkgroup, .inplace = flags.inplace, .overlap = flags.overlap, + .src_overlap = flags.src_overlap, }; webgpu_pipeline pipeline = ctx->shader_lib->get_binary_pipeline(shader_lib_ctx); @@ -1361,11 +1396,28 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, uint32_t ne = (uint32_t) ggml_nelements(dst); + size_t src0_webgpu_tensor_align_offset = ggml_webgpu_tensor_align_offset(ctx, src0); + size_t src1_webgpu_tensor_align_offset = ggml_webgpu_tensor_align_offset(ctx, src1); + + uint32_t offset_merged_src0 = 0; + uint32_t offset_merged_src1 = 0; + if (flags.src_overlap) { + size_t min_off = std::min(src0_webgpu_tensor_align_offset, src1_webgpu_tensor_align_offset); + offset_merged_src0 = (uint32_t) ((src0_webgpu_tensor_align_offset - min_off) / ggml_type_size(src0->type)); + offset_merged_src1 = (uint32_t) ((src1_webgpu_tensor_align_offset - min_off) / ggml_type_size(src0->type)); + } + std::vector params = { ne, (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), - (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), + offset_merged_src0, + offset_merged_src1, + (uint32_t) (src0->nb[0] / ggml_type_size(src0->type)), + (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)), + (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)), + (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)), (uint32_t) (src1->nb[0] / ggml_type_size(src1->type)), (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)), @@ -1381,25 +1433,43 @@ static webgpu_command ggml_webgpu_binary_op(webgpu_context & ctx, std::vector entries; - entries.push_back({ - .binding = 0, - .buffer = ggml_webgpu_tensor_buf(src0), - .offset = ggml_webgpu_tensor_align_offset(ctx, src0), - .size = ggml_webgpu_tensor_binding_size(ctx, src0), - }); - - entries.push_back({ - .binding = 1, - .buffer = ggml_webgpu_tensor_buf(src1), - .offset = ggml_webgpu_tensor_align_offset(ctx, src1), - .size = ggml_webgpu_tensor_binding_size(ctx, src1), - }); - - if 
(!flags.inplace && !flags.overlap) { - entries.push_back({ .binding = 2, - .buffer = ggml_webgpu_tensor_buf(dst), - .offset = ggml_webgpu_tensor_align_offset(ctx, dst), - .size = ggml_webgpu_tensor_binding_size(ctx, dst) }); + if (flags.src_overlap) { + size_t merged_offset = std::min(src0_webgpu_tensor_align_offset, src1_webgpu_tensor_align_offset); + size_t merged_end = std::max(src0_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, src0), + src1_webgpu_tensor_align_offset + ggml_webgpu_tensor_binding_size(ctx, src1)); + entries.push_back({ + .binding = 0, + .buffer = ggml_webgpu_tensor_buf(src0), + .offset = merged_offset, + .size = merged_end - merged_offset, + }); + entries.push_back({ + .binding = 1, + .buffer = ggml_webgpu_tensor_buf(dst), + .offset = ggml_webgpu_tensor_align_offset(ctx, dst), + .size = ggml_webgpu_tensor_binding_size(ctx, dst), + }); + } else { + entries.push_back({ + .binding = 0, + .buffer = ggml_webgpu_tensor_buf(src0), + .offset = src0_webgpu_tensor_align_offset, + .size = ggml_webgpu_tensor_binding_size(ctx, src0), + }); + entries.push_back({ + .binding = 1, + .buffer = ggml_webgpu_tensor_buf(src1), + .offset = src1_webgpu_tensor_align_offset, + .size = ggml_webgpu_tensor_binding_size(ctx, src1), + }); + if (!flags.inplace && !flags.overlap) { + entries.push_back({ + .binding = 2, + .buffer = ggml_webgpu_tensor_buf(dst), + .offset = ggml_webgpu_tensor_align_offset(ctx, dst), + .size = ggml_webgpu_tensor_binding_size(ctx, dst), + }); + } } uint32_t wg_x = CEIL_DIV(ne, decisions->wg_size); @@ -2043,19 +2113,17 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str WEBGPU_CPU_PROFILE_TOTAL_START(graph_compute); - ctx->global_ctx->inflight_threads++; - std::vector commands; std::vector futures; + uint32_t num_batched_kernels = 0; for (int i = 0; i < cgraph->n_nodes; i++) { if (auto cmd = ggml_webgpu_encode_node(ctx, cgraph->nodes[i])) { commands.push_back(*cmd); + num_batched_kernels += cmd.value().num_kernels; } - // compute the batch size based on the number of inflight threads - uint32_t inflight_threads = ctx->global_ctx->inflight_threads; - uint32_t batch_size = std::min(std::max(1u, WEBGPU_NUM_PARAM_BUFS / std::max(inflight_threads, 1u)), - WEBGPU_COMMAND_SUBMIT_BATCH_SIZE); - if (commands.size() >= batch_size) { + + if (num_batched_kernels >= WEBGPU_COMMAND_SUBMIT_BATCH_SIZE) { + num_batched_kernels = 0; futures.push_back(ggml_backend_webgpu_submit(ctx->global_ctx, commands, ctx->param_buf_pool, &ctx->set_rows_error_buf_pool)); // Process events and check for completed submissions @@ -2071,7 +2139,6 @@ static ggml_status ggml_backend_webgpu_graph_compute(ggml_backend_t backend, str } ggml_backend_webgpu_wait(ctx->global_ctx, futures); - ctx->global_ctx->inflight_threads--; WEBGPU_CPU_PROFILE_TOTAL_END(graph_compute, ctx->global_ctx); return GGML_STATUS_SUCCESS; } @@ -2689,7 +2756,7 @@ static webgpu_context initialize_webgpu_context(ggml_backend_dev_t dev) { webgpu_ctx->shader_lib = std::make_unique(dev_ctx->webgpu_global_ctx->device); webgpu_ctx->param_buf_pool.init(webgpu_ctx->global_ctx->device, WEBGPU_NUM_PARAM_BUFS, WEBGPU_PARAMS_BUF_SIZE_BYTES, wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform, - wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite); + wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite, true); webgpu_ctx->set_rows_error_buf_pool.init(webgpu_ctx->global_ctx->device, WEBGPU_NUM_SET_ROWS_ERROR_BUFS, WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES, wgpu::BufferUsage::CopySrc | 
wgpu::BufferUsage::Storage, @@ -2816,10 +2883,8 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_OP_SUB: case GGML_OP_MUL: case GGML_OP_DIV: - // TODO: support non-contiguous tensors, e.g. for MOE_EXPERT_REDUCE - // see https://github.com/ggml-org/llama.cpp/pull/16857 supports_op = (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) && (src0->type == op->type) && - (src1->type == op->type) && ggml_is_contiguous(src0) && ggml_is_contiguous(src1); + (src1->type == op->type); break; case GGML_OP_CPY: case GGML_OP_CONT: diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl index 55dd66408a3..a748dc1b86c 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl +++ b/ggml/src/ggml-webgpu/wgsl-shaders/binary.wgsl @@ -7,6 +7,13 @@ struct Params { offset_src0: u32, offset_src1: u32, offset_dst: u32, + offset_merged_src0: u32, + offset_merged_src1: u32, + + stride_src0_0: u32, + stride_src0_1: u32, + stride_src0_2: u32, + stride_src0_3: u32, stride_src1_0: u32, stride_src1_1: u32, @@ -23,6 +30,21 @@ struct Params { b_ne3: u32, }; +fn src0_index(_i: u32) -> u32 { + var i = _i; + let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); + i = i % (params.a_ne2 * params.a_ne1 * params.a_ne0); + let a_i2 = i / (params.a_ne1 * params.a_ne0); + i = i % (params.a_ne1 * params.a_ne0); + let a_i1 = i / params.a_ne0; + let a_i0 = i % params.a_ne0; + + return a_i0 * params.stride_src0_0 + + a_i1 * params.stride_src0_1 + + a_i2 * params.stride_src0_2 + + a_i3 * params.stride_src0_3; +} + fn src1_index(_i: u32) -> u32 { var i = _i; let a_i3 = i / (params.a_ne2 * params.a_ne1 * params.a_ne0); @@ -53,17 +75,22 @@ fn src1_index(_i: u32) -> u32 { #define DataType f16 #endif +#ifdef SRC_OVERLAP @group(0) @binding(0) -var src0: array; +var merged_src: array; @group(0) @binding(1) -var src1 : array; +var dst: array; -#ifdef INPLACE @group(0) @binding(2) var params: Params; +#else +@group(0) @binding(0) +var src0: array; -#elif defined(OVERLAP) +@group(0) @binding(1) +var src1 : array; +#if defined(INPLACE) || defined(OVERLAP) @group(0) @binding(2) var params: Params; @@ -74,6 +101,7 @@ var dst: array; @group(0) @binding(3) var params: Params; #endif +#endif fn op(a: DataType, b: DataType) -> DataType { #ifdef OP_ADD @@ -87,13 +115,17 @@ fn op(a: DataType, b: DataType) -> DataType { #endif } -fn update(dst_i: u32, src0_i: u32, src1_i: u32){ +fn update(dst_i: u32, src0_i: u32, src1_i: u32) { +#ifdef SRC_OVERLAP + let result = op(merged_src[src0_i], merged_src[src1_i]); +#else let result = op(src0[src0_i], src1[src1_i]); +#endif #ifdef INPLACE - src0[dst_i] = result; + src0[src0_i] = result; #elif defined(OVERLAP) - src1[dst_i] = result; + src1[src1_i] = result; #else dst[dst_i] = result; #endif @@ -102,6 +134,8 @@ fn update(dst_i: u32, src0_i: u32, src1_i: u32){ @compute @workgroup_size(WG_SIZE) fn main(@builtin(global_invocation_id) gid: vec3) { if (gid.x < params.ne) { - update(params.offset_dst + gid.x, params.offset_src0 + gid.x, params.offset_src1 + src1_index(gid.x)); + let src0_i = params.offset_src0 + params.offset_merged_src0 + src0_index(gid.x); + let src1_i = params.offset_src1 + params.offset_merged_src1 + src1_index(gid.x); + update(params.offset_dst + gid.x, src0_i, src1_i); } } diff --git a/scripts/get-wikitext-2.sh b/scripts/get-wikitext-2.sh index 67b0b0118b4..bd03ad35263 100755 --- a/scripts/get-wikitext-2.sh +++ b/scripts/get-wikitext-2.sh @@ -1,11 +1,43 @@ -#!/usr/bin/env bash +#!/bin/sh +# vim: 
set ts=4 sw=4 et: -wget https://huggingface.co/datasets/ggml-org/ci/resolve/main/wikitext-2-raw-v1.zip -unzip wikitext-2-raw-v1.zip +ZIP="wikitext-2-raw-v1.zip" +FILE="wikitext-2-raw/wiki.test.raw" +URL="https://huggingface.co/datasets/ggml-org/ci/resolve/main/$ZIP" -echo "Usage:" -echo "" -echo " ./llama-perplexity -m model.gguf -f wikitext-2-raw/wiki.test.raw [other params]" -echo "" +die() { + printf "%s\n" "$@" >&2 + exit 1 +} -exit 0 +have_cmd() { + for cmd; do + command -v "$cmd" >/dev/null || return + done +} + +dl() { + [ -f "$2" ] && return + if have_cmd wget; then + wget "$1" -O "$2" + elif have_cmd curl; then + curl -L "$1" -o "$2" + else + die "Please install wget or curl" + fi +} + +have_cmd unzip || die "Please install unzip" + +if [ ! -f "$FILE" ]; then + dl "$URL" "$ZIP" || exit + unzip -o "$ZIP" || exit + rm -f -- "$ZIP" +fi + +cat < nr; int nf; // number of fused ops, nf == 1 -> single op (no fusion) bool perm1; // permute src1? + bool src_overlap; // src0 and src1 are overlapping views of the same buffer bool run_whole_graph() override { return nf > 1; } @@ -2992,8 +2993,8 @@ struct test_bin_bcast : public test_case { std::array ne = {10, 10, 1, 1}, std::array nr = {1, 2, 1, 1}, int nf = 1, - bool perm1 = false) - : op(op), type(type), ne(ne), nr(nr), nf(nf), perm1(perm1) {} + bool perm1 = false, bool src_overlap = false) + : op(op), type(type), ne(ne), nr(nr), nf(nf), perm1(perm1), src_overlap(src_overlap) {} ggml_tensor * build_graph(ggml_context * ctx) override { GGML_ASSERT(nf <= 16); @@ -3008,6 +3009,8 @@ struct test_bin_bcast : public test_case { b[i] = ggml_new_tensor_4d(ctx, type, ne[p[0]], ne[p[1]], ne[p[2]], ne[p[3]]); b[i] = ggml_permute(ctx, b[i], p[0], p[1], p[2], p[3]); + } else if (src_overlap) { + b[i] = ggml_view_4d(ctx, a, ne[0], ne[1], ne[2], 2 * (ne[3] / 3), a->nb[1], a->nb[2], a->nb[3], (ne[3] / 3) * a->nb[3]); } else { b[i] = ggml_new_tensor(ctx, type, 4, ne.data()); } @@ -3021,7 +3024,13 @@ struct test_bin_bcast : public test_case { ggml_set_param(b[0]); } - ggml_tensor * out = a; + ggml_tensor *out; + + if (src_overlap) { + out = ggml_view_4d(ctx, a, ne[0], ne[1], ne[2], 2 * (ne[3] / 3), a->nb[1], a->nb[2], a->nb[3], 0); + } else { + out = a; + } for (int i = 0; i < nf; ++i) { out = op(ctx, out, b[i]); @@ -7527,9 +7536,9 @@ static std::vector> make_test_cases_eval() { } } - auto add_test_bin_bcast = [&](ggml_type type, std::array ne, std::array nr, bool perm1 = false) { + auto add_test_bin_bcast = [&](ggml_type type, std::array ne, std::array nr, bool perm1 = false, bool src_overlap = false) { for (auto op : {ggml_add, ggml_sub, ggml_mul, ggml_div}) { - test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr, 1, perm1)); + test_cases.emplace_back(new test_bin_bcast(op, type, ne, nr, 1, perm1, src_overlap)); } }; for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) { @@ -7549,6 +7558,12 @@ static std::vector> make_test_cases_eval() { add_test_bin_bcast(type, {10, 5, 4, 3}, {2, 2, 2, 2}, perm1); } + // src_overlap + add_test_bin_bcast(type, {10, 5, 4, 6}, {1, 1, 1, 1}, false, true); + add_test_bin_bcast(type, {10, 5, 4, 5}, {1, 1, 1, 1}, false, true); + add_test_bin_bcast(type, {1, 1, 120, 120}, {1, 1, 1, 1}, false, true); + add_test_bin_bcast(type, {1, 1, 4, 320}, {1, 1, 1, 1}, false, true); + // test case for k_bin_bcast_unravel in CUDA backend add_test_bin_bcast(type, {1, 1, 65536, 1}, {256, 1, 1, 1});
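
Note on the s390 quants.c hunks: each pair of `vec_mule` + separate vector add is folded into a single `vec_meadd` call. A minimal scalar reference, assuming `vec_meadd(a, b, c)` behaves like the z/Architecture "multiply and add even" (widen the even-indexed int8 lanes, multiply, add the int16 accumulator lane-wise), which is why the odd-lane product from `vec_mulo` is passed in as the accumulator:

#include <cstdint>
#include <cstdio>

// Scalar sketch of the fused multiply-even-add used in the patch: one lane of
// out equals a[2i]*b[2i] + acc[i], so feeding the odd-lane products as acc
// reproduces vec_mule(a,b) + vec_mulo(a,b) with one fewer vector add.
static void mul_even_add(const int8_t a[16], const int8_t b[16],
                         const int16_t acc[8], int16_t out[8]) {
    for (int i = 0; i < 8; ++i) {
        out[i] = (int16_t)(a[2 * i] * b[2 * i] + acc[i]);
    }
}

static void mul_odd(const int8_t a[16], const int8_t b[16], int16_t out[8]) {
    for (int i = 0; i < 8; ++i) {
        out[i] = (int16_t)(a[2 * i + 1] * b[2 * i + 1]);
    }
}

int main() {
    int8_t a[16], b[16];
    for (int i = 0; i < 16; ++i) { a[i] = (int8_t)(i - 8); b[i] = (int8_t)(3 - i); }
    int16_t odd[8], fused[8];
    mul_odd(a, b, odd);
    mul_even_add(a, b, odd, fused);   // per lane: even*even + odd*odd
    for (int i = 0; i < 8; ++i) {
        printf("lane %d: %d\n", i, fused[i]);
    }
    return 0;
}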
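
Note on the Vulkan async_use_transfer_queue changes: uploads recorded into the new transfer context are submitted on the transfer queue with a timeline-semaphore signal, and the next compute submission waits on that value. A minimal sketch of the handoff pattern, assuming a timeline semaphore created with vk::SemaphoreTypeCreateInfo{eTimeline, 0}; the struct and function names below are placeholders, not the ggml_vk_* helpers, and fences/error handling are omitted:

#include <vulkan/vulkan.hpp>

// Placeholder state mirroring transfer_semaphore / transfer_semaphore_last_submitted.
struct timeline_handoff {
    vk::Semaphore sem;
    uint64_t      value  = 0;   // last value signaled on the transfer queue
    uint64_t      waited = 0;   // last value a compute submit has waited on
};

// Transfer queue: submit the copy commands and signal value + 1.
void submit_transfer(vk::Queue transfer_q, vk::CommandBuffer cmd, timeline_handoff & h) {
    h.value++;
    vk::TimelineSemaphoreSubmitInfo tl{};
    tl.signalSemaphoreValueCount = 1;
    tl.pSignalSemaphoreValues    = &h.value;

    vk::SubmitInfo si{};
    si.commandBufferCount   = 1;
    si.pCommandBuffers      = &cmd;
    si.signalSemaphoreCount = 1;
    si.pSignalSemaphores    = &h.sem;
    si.pNext                = &tl;
    transfer_q.submit(si);
}

// Compute queue: wait for the latest transfer value, but only if new transfer
// work was submitted since the last wait.
void submit_compute(vk::Queue compute_q, vk::CommandBuffer cmd, timeline_handoff & h) {
    vk::PipelineStageFlags stage = vk::PipelineStageFlagBits::eAllCommands;

    vk::TimelineSemaphoreSubmitInfo tl{};
    vk::SubmitInfo si{};
    si.commandBufferCount = 1;
    si.pCommandBuffers    = &cmd;
    if (h.waited < h.value) {
        tl.waitSemaphoreValueCount = 1;
        tl.pWaitSemaphoreValues    = &h.value;
        si.waitSemaphoreCount      = 1;
        si.pWaitSemaphores         = &h.sem;
        si.pWaitDstStageMask       = &stage;
        si.pNext                   = &tl;
        h.waited = h.value;
    }
    compute_q.submit(si);
}

In the patch itself, the wait is attached when the next compute context is created (ggml_vk_get_compute_ctx) and transfer_semaphore_last_submitted plays the role of `waited`, so successive compute submissions never re-wait on an already-consumed value; ggml_vk_synchronize additionally issues a semaphore-only submit so the host fence also covers outstanding transfer work.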
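
Note on the webgpu_buf_pool change: instead of throttling by the number of in-flight threads, the pool now grows on demand up to max_pool_size and only blocks on the condition variable once that cap is reached. A generic sketch of that allocation policy, with `Buf` and `make_buf` as placeholders for the host/device buffer pair and ggml_webgpu_create_buffer:

#include <condition_variable>
#include <functional>
#include <mutex>
#include <vector>

template <typename Buf>
struct grow_pool {
    std::mutex              mutex;
    std::condition_variable cv;
    std::vector<Buf>        free;
    size_t                  cur_size = 0;
    size_t                  max_size = 0;
    std::function<Buf()>    make_buf;   // hypothetical factory for a new pool buffer

    Buf alloc() {
        std::unique_lock<std::mutex> lock(mutex);
        if (!free.empty()) {            // fast path: reuse a returned buffer
            Buf b = free.back();
            free.pop_back();
            return b;
        }
        if (cur_size < max_size) {      // grow instead of blocking
            cur_size++;
            return make_buf();
        }
        cv.wait(lock, [this] { return !free.empty(); });
        Buf b = free.back();
        free.pop_back();
        return b;
    }

    void release(Buf b) {
        {
            std::lock_guard<std::mutex> lock(mutex);
            free.push_back(b);
        }
        cv.notify_one();
    }
};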
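
Note on the SRC_OVERLAP path in ggml_webgpu_binary_op: when src0 and src1 are overlapping views of the same wgpu buffer, the two source bindings collapse into one merged range covering both, and the shader receives per-source element offsets (offset_merged_src0/1) measured from the start of that range. A small worked sketch of the arithmetic with made-up byte offsets:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    const size_t type_size  = 4;       // e.g. f32
    const size_t src0_off   = 4096;    // aligned binding offset of src0 in the buffer
    const size_t src0_bytes = 8192;
    const size_t src1_off   = 8192;    // src1 is a view further into the same buffer
    const size_t src1_bytes = 4096;

    const size_t merged_off = std::min(src0_off, src1_off);
    const size_t merged_end = std::max(src0_off + src0_bytes, src1_off + src1_bytes);

    const uint32_t elem_off_src0 = (uint32_t)((src0_off - merged_off) / type_size);
    const uint32_t elem_off_src1 = (uint32_t)((src1_off - merged_off) / type_size);

    printf("bind offset %zu, size %zu bytes\n", merged_off, merged_end - merged_off);
    printf("shader adds %u elements for src0, %u for src1\n", elem_off_src0, elem_off_src1);
    return 0;
}

Binding one range that spans both views sidesteps binding the same buffer region twice while keeping the existing misalignment handling: the shader still adds the per-source misalignment offsets on top of the merged offsets.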
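
Note on the binary.wgsl change: src0 is now addressed through explicit strides instead of being assumed contiguous, which is what allows supports_op to drop the ggml_is_contiguous checks for ADD/SUB/MUL/DIV. A C++ equivalent of the new src0_index() helper, with ne and stride as illustrative parameters (stride = nb / type size):

#include <array>
#include <cstdint>

// Unravel a flat element index over the logical shape ne[] and re-linearize it
// with per-dimension element strides, so non-contiguous views address the
// correct elements.
static uint32_t strided_index(uint32_t flat,
                              const std::array<uint32_t, 4> & ne,
                              const std::array<uint32_t, 4> & stride) {
    uint32_t i  = flat;
    uint32_t i3 = i / (ne[2] * ne[1] * ne[0]);
    i %= ne[2] * ne[1] * ne[0];
    uint32_t i2 = i / (ne[1] * ne[0]);
    i %= ne[1] * ne[0];
    uint32_t i1 = i / ne[0];
    uint32_t i0 = i % ne[0];
    return i0 * stride[0] + i1 * stride[1] + i2 * stride[2] + i3 * stride[3];
}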
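
Note on the new src_overlap mode in test_bin_bcast: src0 and src1 become two views of the same tensor that share the middle third of its outer dimension. A small sketch of that geometry for the ne = {10, 5, 4, 6} case added to the test list; the numbers are derived from the view parameters in build_graph:

#include <cstdint>
#include <cstdio>

// src0 views the first 2*(ne3/3) = 4 outer slabs of `a`, src1 views the last 4
// slabs (offset ne3/3 = 2 slabs), so the operands share slabs 2 and 3.
int main() {
    const int64_t ne[4]   = {10, 5, 4, 6};
    const int64_t slab    = ne[0] * ne[1] * ne[2];   // elements per ne3 slab
    const int64_t view_n3 = 2 * (ne[3] / 3);         // slabs in each view

    const int64_t src0_first = 0;
    const int64_t src1_first = ne[3] / 3;

    const int64_t overlap_begin = src1_first;               // slab 2
    const int64_t overlap_end   = src0_first + view_n3;     // slab 4 (exclusive)

    printf("each view spans %lld slabs of %lld elements\n",
           (long long) view_n3, (long long) slab);
    printf("operands overlap in slabs [%lld, %lld)\n",
           (long long) overlap_begin, (long long) overlap_end);
    return 0;
}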