lhez committed · Commit d0a458b · Parent: dcb106f

opencl: add `mul_mv_id_q4_0_f32_8x_flat` (llama/14003)
ggml/src/ggml-opencl/CMakeLists.txt
CHANGED
@@ -80,6 +80,7 @@ set(GGML_OPENCL_KERNELS
     mul_mv_q4_0_f32_1d_8x_flat
     mul_mv_q4_0_f32_1d_16x_flat
     mul_mv_q6_k
+    mul_mv_id_q4_0_f32_8x_flat
     mul
     norm
     relu
ggml/src/ggml-opencl/ggml-opencl.cpp
CHANGED
@@ -321,6 +321,7 @@ struct ggml_backend_opencl_context {
     cl_program program_upscale;
     cl_program program_concat;
     cl_program program_tsembd;
+    cl_program program_mul_mv_id_q4_0_f32_8x_flat;
 
     cl_kernel kernel_add, kernel_add_row;
     cl_kernel kernel_mul, kernel_mul_row;
@@ -366,6 +367,7 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_concat_f32_contiguous;
     cl_kernel kernel_concat_f32_non_contiguous;
     cl_kernel kernel_timestep_embedding;
+    cl_kernel kernel_mul_mv_id_q4_0_f32_8x_flat;
 
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // Transpose kernels
@@ -1112,7 +1114,7 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         GGML_LOG_CONT(".");
     }
 
-
+    // repeat
     {
 #ifdef GGML_OPENCL_EMBED_KERNELS
         const std::string kernel_src {
@@ -1256,6 +1258,22 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
         }
     }
 
+    // mul_mv_id_q4_0_f32_8x_flat
+    {
+#ifdef GGML_OPENCL_EMBED_KERNELS
+        const std::string kernel_src {
+            #include "mul_mv_id_q4_0_f32_8x_flat.cl.h"
+        };
+#else
+        const std::string kernel_src = read_file("mul_mv_id_q4_0_f32_8x_flat.cl");
+#endif
+        backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat =
+            build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
+
+        CL_CHECK((backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat = clCreateKernel(backend_ctx->program_mul_mv_id_q4_0_f32_8x_flat, "kernel_mul_mv_id_q4_0_f32_8x_flat", &err), err));
+        GGML_LOG_CONT(".");
+    }
+
     // Adreno kernels
 #ifdef GGML_OPENCL_USE_ADRENO_KERNELS
     // transpose
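Note on the two loading paths above: with GGML_OPENCL_EMBED_KERNELS, the build is expected to generate mul_mv_id_q4_0_f32_8x_flat.cl.h so that the #include expands inside the braced std::string initializer; otherwise the .cl file registered in CMakeLists.txt is read from disk at runtime via read_file(). A minimal sketch of why the embedded branch type-checks, assuming the generated header holds a single raw string literal with the kernel source (the literal below is hypothetical, standing in for that header's contents):

#include <string>

// If the generated .cl.h contains one raw string literal, the #include in
// load_cl_kernels behaves exactly like writing the literal in place:
const std::string kernel_src {
    R"(kernel void example(global float * x) { x[get_global_id(0)] *= 2.f; })"
};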
@@ -2178,6 +2196,13 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
                 return op->src[1]->type == GGML_TYPE_F32 && ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
             }
             return false;
+        case GGML_OP_MUL_MAT_ID:
+            if (op->src[0]->type == GGML_TYPE_Q4_0) {
+                if (op->src[1]->type == GGML_TYPE_F32) {
+                    return ggml_is_contiguous(op->src[0]) && ggml_is_contiguous(op->src[1]);
+                }
+            }
+            return false;
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
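For context: GGML_OP_MUL_MAT_ID is the mixture-of-experts matrix multiply, where dst->src[2] carries the per-row expert indices (the dispatch code below reads its shape as ne20 = n_used_experts, ne21 = n_rows). The gate above claims only the single combination the new kernel implements. A self-contained restatement of that predicate, using hypothetical stand-in types rather than the real ggml structs:

// Illustrative stand-ins for the ggml types; example_type/example_tensor
// are not ggml names.
enum class example_type { f32, q4_0 };
struct example_tensor { example_type type; bool contiguous; };

// Mirrors the new GGML_OP_MUL_MAT_ID case: only contiguous Q4_0 weights
// multiplied by contiguous F32 activations are reported as supported.
static bool supports_mul_mat_id(const example_tensor & src0, const example_tensor & src1) {
    if (src0.type == example_type::q4_0 && src1.type == example_type::f32) {
        return src0.contiguous && src1.contiguous;
    }
    return false;
}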
@@ -5536,6 +5561,136 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     }
 }
 
+static void ggml_cl_mul_mat_id(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(src0);
+    GGML_ASSERT(src0->extra);
+    GGML_ASSERT(src1);
+    GGML_ASSERT(src1->extra);
+    GGML_ASSERT(dst);
+    GGML_ASSERT(dst->extra);
+
+    const ggml_tensor * src2 = dst->src[2];
+    GGML_ASSERT(src2);
+    GGML_ASSERT(src2->extra);
+
+    ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
+    cl_command_queue queue = backend_ctx->queue;
+
+    ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
+    ggml_tensor_extra_cl * extra2 = (ggml_tensor_extra_cl *)src2->extra;
+    ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
+
+    cl_ulong offset1 = extra1->offset + src1->view_offs;
+    cl_ulong offset2 = extra2->offset + src2->view_offs;
+    cl_ulong offsetd = extrad->offset + dst->view_offs;
+
+#ifdef GGML_OPENCL_SOA_Q
+    ggml_tensor_extra_cl_q4_0 * extra0_q4_0 = (ggml_tensor_extra_cl_q4_0 *)src0->extra;
+#endif
+
+    const int ne00 = src0->ne[0];
+    const int ne01 = src0->ne[1];
+    const int ne02 = src0->ne[2];
+    const int ne03 = src0->ne[3];
+
+    const cl_ulong nb00 = src0->nb[0];
+    const cl_ulong nb02 = src0->nb[2];
+
+    const int ne10 = src1->ne[0];
+    const int ne11 = src1->ne[1];
+    const int ne12 = src1->ne[2];
+    const int ne13 = src1->ne[3];
+
+    const cl_ulong nb11 = src1->nb[1];
+    const cl_ulong nb12 = src1->nb[2];
+
+    const int ne20 = src2->ne[0];
+    const int ne21 = src2->ne[1];
+
+    const cl_ulong nb21 = src2->nb[1];
+
+    const int ne0 = dst->ne[0];
+    const int ne1 = dst->ne[1];
+
+    const int r2 = ne12/ne02;
+    const int r3 = ne13/ne03;
+    const int dst_rows = ne20*ne21; // ne20 = n_used_experts, ne21 = n_rows
+
+    GGML_ASSERT(ne00 == ne10);
+
+    int sgs = 32; // subgroup size
+    int nsg = 1; // number of subgroups
+    int nrows = 1; // number of rows in src1
+    int ndst = 4; // number of values produced by each subgroup
+
+    cl_kernel kernel;
+
+    // subgroup mat vec
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0: {
+            kernel = backend_ctx->kernel_mul_mv_id_q4_0_f32_8x_flat;
+
+            if (backend_ctx->gpu_family == INTEL) {
+                sgs = 16;
+                nsg = 1;
+                ndst = 8;
+            } else if (backend_ctx->gpu_family == ADRENO) {
+                sgs = 64;
+                nsg = 1;
+                ndst = 8;
+            } else {
+                GGML_ASSERT(false && "TODO: Unknown GPU");
+            }
+
+            CL_CHECK(clSetKernelArg(kernel,  0, sizeof(cl_mem), &extra0_q4_0->q));
+            CL_CHECK(clSetKernelArg(kernel,  1, sizeof(cl_mem), &extra0_q4_0->d));
+            CL_CHECK(clSetKernelArg(kernel,  2, sizeof(cl_mem), &extra1->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  3, sizeof(cl_ulong), &offset1));
+            CL_CHECK(clSetKernelArg(kernel,  4, sizeof(cl_mem), &extra2->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  5, sizeof(cl_ulong), &offset2));
+            CL_CHECK(clSetKernelArg(kernel,  6, sizeof(cl_mem), &extrad->data_device));
+            CL_CHECK(clSetKernelArg(kernel,  7, sizeof(cl_ulong), &offsetd));
+            CL_CHECK(clSetKernelArg(kernel,  8, sizeof(int), &ne00));
+            CL_CHECK(clSetKernelArg(kernel,  9, sizeof(int), &ne01));
+            CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne02));
+            CL_CHECK(clSetKernelArg(kernel, 11, sizeof(cl_ulong), &nb00));
+            CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb02));
+            CL_CHECK(clSetKernelArg(kernel, 13, sizeof(int), &ne10));
+            CL_CHECK(clSetKernelArg(kernel, 14, sizeof(int), &ne11));
+            CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &ne12));
+            CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb11));
+            CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb12));
+            CL_CHECK(clSetKernelArg(kernel, 18, sizeof(int), &ne20));
+            CL_CHECK(clSetKernelArg(kernel, 19, sizeof(int), &ne21));
+            CL_CHECK(clSetKernelArg(kernel, 20, sizeof(cl_ulong), &nb21));
+            CL_CHECK(clSetKernelArg(kernel, 21, sizeof(int), &ne0));
+            CL_CHECK(clSetKernelArg(kernel, 22, sizeof(int), &ne1));
+            CL_CHECK(clSetKernelArg(kernel, 23, sizeof(int), &r2));
+            CL_CHECK(clSetKernelArg(kernel, 24, sizeof(int), &r3));
+
+            break;
+        }
+        default:
+            GGML_ASSERT(false && "not implemented");
+    }
+
+    int _ne1 = 1;
+    int ne123 = dst_rows;
+
+    size_t global_work_size[] = {(size_t)(ne01+ndst*nsg-1)/(ndst*nsg)*sgs, (size_t)(_ne1+nrows-1)/nrows*nsg, (size_t)ne123};
+    size_t local_work_size[] = {(size_t)sgs, (size_t)nsg, 1};
+
+#ifdef GGML_OPENCL_PROFILING
+    cl_event evt;
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &evt));
+
+    g_profiling_info.emplace_back();
+    populateProfilingInfo(g_profiling_info.back(), evt, kernel, global_work_size, local_work_size, dst);
+#else
+    CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, NULL));
+#endif
+}
+
 static void ggml_cl_scale(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(src0);
     GGML_ASSERT(src0->extra);
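A quick sanity check of the dispatch geometry above, as a worked example with assumed shapes (not taken from the source): each work-group is one subgroup of sgs threads; dimension 0 covers the ne01 expert rows in chunks of ndst rows per subgroup, and dimension 2 launches one work-group per (row, used-expert) pair, which the kernel decodes as iid1 = get_group_id(2)/ne20 and idx = get_group_id(2)%ne20.

#include <cstdio>

// Reproduces the NDRange arithmetic in ggml_cl_mul_mat_id for the Intel
// path (sgs = 16, nsg = 1, ndst = 8) with assumed shapes: ne01 = 4096 rows
// per expert, ne20 = 2 experts used per token, ne21 = 1 token row.
int main() {
    int sgs = 16, nsg = 1, ndst = 8, nrows = 1;
    int ne01 = 4096, ne20 = 2, ne21 = 1, _ne1 = 1;

    size_t gws[3] = {
        (size_t)(ne01 + ndst*nsg - 1)/(ndst*nsg)*sgs, // 512 subgroups * 16 threads = 8192
        (size_t)(_ne1 + nrows - 1)/nrows*nsg,         // 1
        (size_t)(ne20*ne21),                          // 2: one group per (expert, row) pair
    };
    size_t lws[3] = { (size_t)sgs, (size_t)nsg, 1 };

    std::printf("global = {%zu, %zu, %zu}, local = {%zu, %zu, %zu}\n",
                gws[0], gws[1], gws[2], lws[0], lws[1], lws[2]);
    return 0;
}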
@@ -6444,6 +6599,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
             }
             func = ggml_cl_mul_mat;
             break;
+        case GGML_OP_MUL_MAT_ID:
+            if (!any_on_device) {
+                return false;
+            }
+            func = ggml_cl_mul_mat_id;
+            break;
         case GGML_OP_SCALE:
             if (!any_on_device) {
                 return false;
ggml/src/ggml-opencl/kernels/mul_mv_id_q4_0_f32_8x_flat.cl
ADDED
@@ -0,0 +1,283 @@
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#ifdef cl_intel_subgroups
+#pragma OPENCL EXTENSION cl_intel_subgroups : enable
+#else
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#endif
+
+#ifdef cl_intel_required_subgroup_size
+#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
+#define INTEL_GPU 1
+#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
+#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
+#elif defined(cl_qcom_reqd_sub_group_size)
+#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
+#define ADRENO_GPU 1
+#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
+#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
+#endif
+
+#define QK4_0 32
+
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef short int16_t;
+typedef ushort uint16_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+//------------------------------------------------------------------------------
+// block_q4_0
+//------------------------------------------------------------------------------
+struct block_q4_0
+{
+    half d;
+    uint8_t qs[QK4_0 / 2];
+};
+
+// This function requires the original shuffled weights.
+// As a reminder, the original weights are shuffled so that (q[0], q[16]) are
+// packed together in a byte, so are (q[1], q[17]) and so on.
+inline float block_q_4_0_dot_y_flat(
+        global uchar * x,
+        global half * dh,
+        float sumy,
+        float16 yl,
+        int il
+) {
+    float d = *dh;
+    global ushort * qs = ((global ushort *)x + il/2);
+    float acc = 0.f;
+
+    acc += yl.s0 * (qs[0] & 0x000F);
+    acc += yl.s1 * (qs[0] & 0x0F00);
+    acc += yl.s8 * (qs[0] & 0x00F0);
+    acc += yl.s9 * (qs[0] & 0xF000);
+
+    acc += yl.s2 * (qs[1] & 0x000F);
+    acc += yl.s3 * (qs[1] & 0x0F00);
+    acc += yl.sa * (qs[1] & 0x00F0);
+    acc += yl.sb * (qs[1] & 0xF000);
+
+    acc += yl.s4 * (qs[2] & 0x000F);
+    acc += yl.s5 * (qs[2] & 0x0F00);
+    acc += yl.sc * (qs[2] & 0x00F0);
+    acc += yl.sd * (qs[2] & 0xF000);
+
+    acc += yl.s6 * (qs[3] & 0x000F);
+    acc += yl.s7 * (qs[3] & 0x0F00);
+    acc += yl.se * (qs[3] & 0x00F0);
+    acc += yl.sf * (qs[3] & 0xF000);
+
+    return d * (sumy * -8.f + acc);
+}
+
+//
+// This variant outputs 8 values.
+//
+#undef N_DST
+#undef N_SIMDGROUP
+#undef N_SIMDWIDTH
+
+#ifdef INTEL_GPU
+#define N_DST 8 // each SIMD group works on 8 rows
+#define N_SIMDGROUP 1 // number of SIMD groups in a thread group
+#define N_SIMDWIDTH 16 // subgroup size
+#elif defined (ADRENO_GPU)
+#define N_DST 8
+#define N_SIMDGROUP 1
+#define N_SIMDWIDTH 64
+#endif
+
+inline void mul_vec_q_n_f32_8x_flat(
+        global char * src0_q,
+        global half * src0_d,
+        global float * src1,
+        global float * dst,
+        int ne00,
+        int ne01,
+        int ne02,
+        int ne10,
+        int ne12,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    const ulong nb = ne00/QK4_0;
+
+    int r0 = get_group_id(0);
+    int r1 = get_group_id(1);
+    int im = 0;
+
+    int first_row = (r0 * N_SIMDGROUP + get_sub_group_id()) * N_DST;
+
+    int i12 = im%ne12;
+    int i13 = im/ne12;
+
+    // The number of scales is the same as the number of blocks.
+    ulong offset0_d = first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02);
+    // Each block contains QK4_0/2 uchars, hence offset for qs is as follows.
+    ulong offset0_q = (first_row * nb + (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02)) * QK4_0/2;
+
+    global uchar * x = (global uchar *) src0_q + offset0_q;
+    global half * d = (global half *) src0_d + offset0_d;
+    global float * y = (global float *) src1 + r1*ne10 + im*ne00*ne1;
+
+    float16 yl;
+    float8 sumf = 0.f;
+
+    int ix = get_sub_group_local_id()/2;
+    int il = 8*(get_sub_group_local_id()%2);
+
+    global float * yb = y + ix*QK4_0 + il;
+
+    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/2) {
+        float sumy = 0.f;
+
+        sumy += yb[0];
+        sumy += yb[1];
+        sumy += yb[2];
+        sumy += yb[3];
+        sumy += yb[4];
+        sumy += yb[5];
+        sumy += yb[6];
+        sumy += yb[7];
+
+        sumy += yb[16];
+        sumy += yb[17];
+        sumy += yb[18];
+        sumy += yb[19];
+        sumy += yb[20];
+        sumy += yb[21];
+        sumy += yb[22];
+        sumy += yb[23];
+
+        yl.s0 = yb[0];
+        yl.s1 = yb[1]/256.f;
+
+        yl.s2 = yb[2];
+        yl.s3 = yb[3]/256.f;
+
+        yl.s4 = yb[4];
+        yl.s5 = yb[5]/256.f;
+
+        yl.s6 = yb[6];
+        yl.s7 = yb[7]/256.f;
+
+        yl.s8 = yb[16]/16.f;
+        yl.s9 = yb[17]/4096.f;
+
+        yl.sa = yb[18]/16.f;
+        yl.sb = yb[19]/4096.f;
+
+        yl.sc = yb[20]/16.f;
+        yl.sd = yb[21]/4096.f;
+
+        yl.se = yb[22]/16.f;
+        yl.sf = yb[23]/4096.f;
+
+        sumf.s0 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 0*nb*QK4_0/2, d + ib + 0*nb, sumy, yl, il);
+        sumf.s1 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 1*nb*QK4_0/2, d + ib + 1*nb, sumy, yl, il);
+        sumf.s2 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 2*nb*QK4_0/2, d + ib + 2*nb, sumy, yl, il);
+        sumf.s3 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 3*nb*QK4_0/2, d + ib + 3*nb, sumy, yl, il);
+
+        sumf.s4 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 4*nb*QK4_0/2, d + ib + 4*nb, sumy, yl, il);
+        sumf.s5 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 5*nb*QK4_0/2, d + ib + 5*nb, sumy, yl, il);
+        sumf.s6 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 6*nb*QK4_0/2, d + ib + 6*nb, sumy, yl, il);
+        sumf.s7 += block_q_4_0_dot_y_flat(x + ib*QK4_0/2 + 7*nb*QK4_0/2, d + ib + 7*nb, sumy, yl, il);
+
+        yb += QK4_0 * (N_SIMDWIDTH/2);
+    }
+
+    float8 tot = (float8)(
+        sub_group_reduce_add(sumf.s0), sub_group_reduce_add(sumf.s1),
+        sub_group_reduce_add(sumf.s2), sub_group_reduce_add(sumf.s3),
+        sub_group_reduce_add(sumf.s4), sub_group_reduce_add(sumf.s5),
+        sub_group_reduce_add(sumf.s6), sub_group_reduce_add(sumf.s7)
+    );
+
+    if (get_sub_group_local_id() == 0) {
+        if (first_row + 0 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 0] = tot.s0;
+        }
+        if (first_row + 1 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 1] = tot.s1;
+        }
+        if (first_row + 2 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 2] = tot.s2;
+        }
+        if (first_row + 3 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 3] = tot.s3;
+        }
+
+        if (first_row + 4 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 4] = tot.s4;
+        }
+        if (first_row + 5 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 5] = tot.s5;
+        }
+        if (first_row + 6 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 6] = tot.s6;
+        }
+        if (first_row + 7 < ne01) {
+            dst[r1*ne0 + im*ne0*ne1 + first_row + 7] = tot.s7;
+        }
+    }
+}
+
+#ifdef INTEL_GPU
+REQD_SUBGROUP_SIZE_16
+#elif defined (ADRENO_GPU)
+REQD_SUBGROUP_SIZE_64
+#endif
+kernel void kernel_mul_mv_id_q4_0_f32_8x_flat(
+        global char * src0_q,
+        global half * src0_d,
+        global float * src1,
+        ulong offset1,
+        global char * src2,
+        ulong offset2,
+        global float * dst,
+        ulong offsetd,
+        int ne00,
+        int ne01,
+        int ne02,
+        ulong nb00,
+        ulong nb02,
+        int ne10,
+        int ne11,
+        int ne12,
+        ulong nb11,
+        ulong nb12,
+        int ne20,
+        int ne21,
+        ulong nb21,
+        int ne0,
+        int ne1,
+        int r2,
+        int r3
+) {
+    src1 = (global float *)((global char *)src1 + offset1);
+    src2 = (global char *)((global char *)src2 + offset2);
+    dst = (global float *)((global char *)dst + offsetd);
+
+    const int iid1 = get_group_id(2)/ne20;
+    const int idx = get_group_id(2)%ne20;
+
+    const int i02 = ((global int *)(src2 + iid1*nb21))[idx];
+
+    const int i11 = idx%ne11;
+    const int i12 = iid1;
+
+    const int i1 = idx;
+    const int i2 = i12;
+
+    global char * src0_q_cur = src0_q + (i02*nb02/nb00)*(QK4_0/2);
+    global half * src0_d_cur = src0_d + (i02*nb02/nb00);
+    global float * src1_cur = (global float *)((global char *) src1 + i11*nb11 + i12*nb12);
+    global float * dst_cur = dst + i1*ne0 + i2*ne1*ne0;
+
+    mul_vec_q_n_f32_8x_flat(src0_q_cur, src0_d_cur, src1_cur, dst_cur, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3);
+}
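The dot-product helper above never unpacks the nibbles. Since the Q4_0 bytes store (q[i], q[i+16]) as low/high nibbles (per the comment in the file), a ushort load holds four quants at bit offsets 0, 4, 8, and 12; the kernel masks each one in place and pre-divides the matching activation by 16, 256, or 4096 instead of shifting the quant down. The trailing `sumy * -8.f` applies the Q4_0 zero point, since each value dequantizes as d*(q - 8). A small self-contained check of the masking identity (assuming little-endian ushort loads, as on the targeted GPUs; the byte values are arbitrary):

#include <cassert>
#include <cstdint>

int main() {
    // Two Q4_0 bytes in the shuffled order: byte0 = (q16<<4)|q0, byte1 = (q17<<4)|q1.
    uint8_t  b0 = 0xA3, b1 = 0x7C;                    // q0=3, q16=0xA, q1=0xC, q17=7
    uint16_t qs = (uint16_t)(b0 | (b1 << 8));          // little-endian ushort load

    float y0 = 1.5f, y1 = -2.0f, y16 = 0.25f, y17 = 3.0f;

    // Masked form used by block_q_4_0_dot_y_flat: the activation is
    // pre-scaled by 2^-4, 2^-8, or 2^-12 instead of shifting the quant.
    float masked = y0         * (qs & 0x000F)
                 + y1/256.f   * (qs & 0x0F00)
                 + y16/16.f   * (qs & 0x00F0)
                 + y17/4096.f * (qs & 0xF000);

    float plain = y0*3 + y1*12 + y16*10 + y17*7;

    assert(masked == plain); // exact: only power-of-two scalings are involved
    return 0;
}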