Spaces:
Running
Running
lhez
commited on
Commit
·
5e203ec
1
Parent(s):
e62ef85
opencl: add `set_rows` for `f16` and `f32` (llama/14547)
Browse files* opencl: add `set_rows` for `f16` and `f32`
* opencl: better choose workgroup size for `set_rows`
ggml/src/ggml-opencl/CMakeLists.txt
CHANGED
|
@@ -88,6 +88,7 @@ set(GGML_OPENCL_KERNELS
|
|
| 88 |
rms_norm
|
| 89 |
rope
|
| 90 |
scale
|
|
|
|
| 91 |
sigmoid
|
| 92 |
silu
|
| 93 |
softmax_4_f32
|
|
|
|
| 88 |
rms_norm
|
| 89 |
rope
|
| 90 |
scale
|
| 91 |
+
set_rows
|
| 92 |
sigmoid
|
| 93 |
silu
|
| 94 |
softmax_4_f32
|
ggml/src/ggml-opencl/ggml-opencl.cpp
CHANGED
|
@@ -351,6 +351,7 @@ struct ggml_backend_opencl_context {
|
|
| 351 |
cl_program program_gemv_noshuffle_general;
|
| 352 |
cl_program program_gemv_noshuffle;
|
| 353 |
cl_program program_get_rows;
|
|
|
|
| 354 |
cl_program program_glu;
|
| 355 |
cl_program program_im2col_f16;
|
| 356 |
cl_program program_im2col_f32;
|
|
@@ -412,6 +413,7 @@ struct ggml_backend_opencl_context {
|
|
| 412 |
cl_kernel kernel_soft_max, kernel_soft_max_4;
|
| 413 |
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
|
| 414 |
cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
|
|
|
|
| 415 |
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
|
| 416 |
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
|
| 417 |
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
|
|
@@ -529,6 +531,16 @@ struct ggml_backend_opencl_context {
|
|
| 529 |
fclose(ftrace);
|
| 530 |
}
|
| 531 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
|
| 533 |
#ifdef GGML_OPENCL_PROFILING
|
| 534 |
cl_event evt;
|
|
@@ -1431,6 +1443,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
|
| 1431 |
}
|
| 1432 |
}
|
| 1433 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1434 |
// mul_mv_id_q4_0_f32_8x_flat
|
| 1435 |
{
|
| 1436 |
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
@@ -2233,8 +2262,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
|
| 2233 |
{
|
| 2234 |
// TODO: add support
|
| 2235 |
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
| 2236 |
-
|
| 2237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2238 |
case GGML_OP_CPY:
|
| 2239 |
case GGML_OP_DUP:
|
| 2240 |
case GGML_OP_CONT:
|
|
@@ -3374,6 +3412,111 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
|
| 3374 |
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
| 3375 |
}
|
| 3376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3377 |
static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 3378 |
GGML_ASSERT(src0);
|
| 3379 |
GGML_ASSERT(src0->extra);
|
|
@@ -6388,6 +6531,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
|
| 6388 |
}
|
| 6389 |
func = ggml_cl_get_rows;
|
| 6390 |
break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6391 |
case GGML_OP_CPY:
|
| 6392 |
if (!any_on_device) {
|
| 6393 |
return false;
|
|
|
|
| 351 |
cl_program program_gemv_noshuffle_general;
|
| 352 |
cl_program program_gemv_noshuffle;
|
| 353 |
cl_program program_get_rows;
|
| 354 |
+
cl_program program_set_rows;
|
| 355 |
cl_program program_glu;
|
| 356 |
cl_program program_im2col_f16;
|
| 357 |
cl_program program_im2col_f32;
|
|
|
|
| 413 |
cl_kernel kernel_soft_max, kernel_soft_max_4;
|
| 414 |
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
|
| 415 |
cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
|
| 416 |
+
cl_kernel kernel_set_rows_f32, kernel_set_rows_f16;
|
| 417 |
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
|
| 418 |
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
|
| 419 |
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
|
|
|
|
| 531 |
fclose(ftrace);
|
| 532 |
}
|
| 533 |
|
| 534 |
+
size_t get_kernel_workgroup_size(cl_kernel kernel) const {
|
| 535 |
+
size_t workgroup_size = 0;
|
| 536 |
+
size_t ret_size = 0;
|
| 537 |
+
CL_CHECK(
|
| 538 |
+
clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
|
| 539 |
+
sizeof(size_t), &workgroup_size, &ret_size));
|
| 540 |
+
GGML_ASSERT(sizeof(size_t) == ret_size);
|
| 541 |
+
return workgroup_size;
|
| 542 |
+
}
|
| 543 |
+
|
| 544 |
void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
|
| 545 |
#ifdef GGML_OPENCL_PROFILING
|
| 546 |
cl_event evt;
|
|
|
|
| 1443 |
}
|
| 1444 |
}
|
| 1445 |
|
| 1446 |
+
// set_rows
|
| 1447 |
+
{
|
| 1448 |
+
#ifdef GGML_OPENCL_EMBED_KERNELS
|
| 1449 |
+
const std::string kernel_src {
|
| 1450 |
+
#include "set_rows.cl.h"
|
| 1451 |
+
};
|
| 1452 |
+
#else
|
| 1453 |
+
const std::string kernel_src = read_file("set_rows.cl");
|
| 1454 |
+
#endif
|
| 1455 |
+
backend_ctx->program_set_rows =
|
| 1456 |
+
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
| 1457 |
+
|
| 1458 |
+
CL_CHECK((backend_ctx->kernel_set_rows_f32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32", &err), err));
|
| 1459 |
+
CL_CHECK((backend_ctx->kernel_set_rows_f16 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16", &err), err));
|
| 1460 |
+
GGML_LOG_CONT(".");
|
| 1461 |
+
}
|
| 1462 |
+
|
| 1463 |
// mul_mv_id_q4_0_f32_8x_flat
|
| 1464 |
{
|
| 1465 |
#ifdef GGML_OPENCL_EMBED_KERNELS
|
|
|
|
| 2262 |
{
|
| 2263 |
// TODO: add support
|
| 2264 |
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
| 2265 |
+
if (op->src[0]->type != GGML_TYPE_F32) {
|
| 2266 |
+
return false;
|
| 2267 |
+
}
|
| 2268 |
+
switch (op->type) {
|
| 2269 |
+
case GGML_TYPE_F16:
|
| 2270 |
+
case GGML_TYPE_F32:
|
| 2271 |
+
return true;
|
| 2272 |
+
default:
|
| 2273 |
+
return false;
|
| 2274 |
+
}
|
| 2275 |
+
}
|
| 2276 |
case GGML_OP_CPY:
|
| 2277 |
case GGML_OP_DUP:
|
| 2278 |
case GGML_OP_CONT:
|
|
|
|
| 3412 |
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
| 3413 |
}
|
| 3414 |
|
| 3415 |
+
static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 3416 |
+
GGML_ASSERT(src0);
|
| 3417 |
+
GGML_ASSERT(src0->extra);
|
| 3418 |
+
GGML_ASSERT(src1);
|
| 3419 |
+
GGML_ASSERT(src1->extra);
|
| 3420 |
+
GGML_ASSERT(dst);
|
| 3421 |
+
GGML_ASSERT(dst->extra);
|
| 3422 |
+
|
| 3423 |
+
// ne0 = ne00
|
| 3424 |
+
// ne2 = ne02
|
| 3425 |
+
// ne3 = ne03
|
| 3426 |
+
|
| 3427 |
+
const int ne01 = src0->ne[1];
|
| 3428 |
+
const int ne02 = src0->ne[2];
|
| 3429 |
+
const int ne03 = src0->ne[3];
|
| 3430 |
+
|
| 3431 |
+
const cl_ulong nb01 = src0->nb[1];
|
| 3432 |
+
const cl_ulong nb02 = src0->nb[2];
|
| 3433 |
+
const cl_ulong nb03 = src0->nb[3];
|
| 3434 |
+
|
| 3435 |
+
const int ne11 = src1->ne[1];
|
| 3436 |
+
const int ne12 = src1->ne[2];
|
| 3437 |
+
|
| 3438 |
+
const cl_ulong nb10 = src1->nb[0];
|
| 3439 |
+
const cl_ulong nb11 = src1->nb[1];
|
| 3440 |
+
const cl_ulong nb12 = src1->nb[2];
|
| 3441 |
+
|
| 3442 |
+
const int ne0 = dst->ne[0];
|
| 3443 |
+
|
| 3444 |
+
const cl_ulong nb1 = dst->nb[1];
|
| 3445 |
+
const cl_ulong nb2 = dst->nb[2];
|
| 3446 |
+
const cl_ulong nb3 = dst->nb[3];
|
| 3447 |
+
|
| 3448 |
+
const int nblk0 = ne0/ggml_blck_size(dst->type);
|
| 3449 |
+
|
| 3450 |
+
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
| 3451 |
+
|
| 3452 |
+
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
| 3453 |
+
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
| 3454 |
+
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
| 3455 |
+
|
| 3456 |
+
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
| 3457 |
+
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
| 3458 |
+
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
| 3459 |
+
|
| 3460 |
+
cl_kernel kernel;
|
| 3461 |
+
|
| 3462 |
+
switch (dst->type) {
|
| 3463 |
+
case GGML_TYPE_F32:
|
| 3464 |
+
kernel = backend_ctx->kernel_set_rows_f32;
|
| 3465 |
+
break;
|
| 3466 |
+
case GGML_TYPE_F16:
|
| 3467 |
+
kernel = backend_ctx->kernel_set_rows_f16;
|
| 3468 |
+
break;
|
| 3469 |
+
default:
|
| 3470 |
+
GGML_ABORT("not implemented");
|
| 3471 |
+
}
|
| 3472 |
+
|
| 3473 |
+
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
| 3474 |
+
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
| 3475 |
+
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
| 3476 |
+
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
| 3477 |
+
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
| 3478 |
+
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
| 3479 |
+
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
|
| 3480 |
+
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
| 3481 |
+
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
| 3482 |
+
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
| 3483 |
+
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11));
|
| 3484 |
+
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
|
| 3485 |
+
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
|
| 3486 |
+
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
|
| 3487 |
+
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
|
| 3488 |
+
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &nblk0));
|
| 3489 |
+
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb1));
|
| 3490 |
+
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb2));
|
| 3491 |
+
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb3));
|
| 3492 |
+
|
| 3493 |
+
int nth0 = 64;
|
| 3494 |
+
if (backend_ctx->gpu_family == INTEL) {
|
| 3495 |
+
nth0 = 32;
|
| 3496 |
+
} else if (backend_ctx->gpu_family == ADRENO) {
|
| 3497 |
+
nth0 = 64;
|
| 3498 |
+
}
|
| 3499 |
+
|
| 3500 |
+
int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
|
| 3501 |
+
while (nth0 < nblk0 && nth0 < max_workgroup_size) {
|
| 3502 |
+
nth0 *= 2;
|
| 3503 |
+
}
|
| 3504 |
+
|
| 3505 |
+
int rows_per_workgroup = 1;
|
| 3506 |
+
if (nth0 > nblk0) {
|
| 3507 |
+
rows_per_workgroup = nth0 / nblk0;
|
| 3508 |
+
nth0 = nblk0;
|
| 3509 |
+
}
|
| 3510 |
+
|
| 3511 |
+
size_t global_work_size[] = {
|
| 3512 |
+
(size_t)(ne01 + rows_per_workgroup - 1)/rows_per_workgroup*nth0,
|
| 3513 |
+
(size_t)ne02*rows_per_workgroup,
|
| 3514 |
+
(size_t)ne03};
|
| 3515 |
+
size_t local_work_size[] = {(size_t)nth0, (size_t)rows_per_workgroup, 1};
|
| 3516 |
+
|
| 3517 |
+
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
| 3518 |
+
}
|
| 3519 |
+
|
| 3520 |
static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
| 3521 |
GGML_ASSERT(src0);
|
| 3522 |
GGML_ASSERT(src0->extra);
|
|
|
|
| 6531 |
}
|
| 6532 |
func = ggml_cl_get_rows;
|
| 6533 |
break;
|
| 6534 |
+
case GGML_OP_SET_ROWS:
|
| 6535 |
+
if (!any_on_device) {
|
| 6536 |
+
return false;
|
| 6537 |
+
}
|
| 6538 |
+
func = ggml_cl_set_rows;
|
| 6539 |
+
break;
|
| 6540 |
case GGML_OP_CPY:
|
| 6541 |
if (!any_on_device) {
|
| 6542 |
return false;
|
ggml/src/ggml-opencl/kernels/set_rows.cl
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
| 2 |
+
|
| 3 |
+
kernel void kernel_set_rows_f32(
|
| 4 |
+
global char * src0,
|
| 5 |
+
ulong offset0,
|
| 6 |
+
global char * src1,
|
| 7 |
+
ulong offset1,
|
| 8 |
+
global char * dst,
|
| 9 |
+
ulong offsetd,
|
| 10 |
+
int ne01,
|
| 11 |
+
ulong nb01,
|
| 12 |
+
ulong nb02,
|
| 13 |
+
ulong nb03,
|
| 14 |
+
int ne11,
|
| 15 |
+
int ne12,
|
| 16 |
+
ulong nb10,
|
| 17 |
+
ulong nb11,
|
| 18 |
+
ulong nb12,
|
| 19 |
+
int nblk0,
|
| 20 |
+
ulong nb1,
|
| 21 |
+
ulong nb2,
|
| 22 |
+
ulong nb3
|
| 23 |
+
) {
|
| 24 |
+
src0 = src0 + offset0;
|
| 25 |
+
src1 = src1 + offset1;
|
| 26 |
+
dst = dst + offsetd;
|
| 27 |
+
|
| 28 |
+
int i03 = get_group_id(2);
|
| 29 |
+
int i02 = get_group_id(1);
|
| 30 |
+
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
| 31 |
+
|
| 32 |
+
if (i01 >= ne01) {
|
| 33 |
+
return;
|
| 34 |
+
}
|
| 35 |
+
|
| 36 |
+
int i12 = i03%ne12;
|
| 37 |
+
int i11 = i02%ne11;
|
| 38 |
+
|
| 39 |
+
int i10 = i01;
|
| 40 |
+
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
| 41 |
+
|
| 42 |
+
global float * dst_row = (global float *) (dst + i1*nb1 + i02*nb2 + i03*nb3);
|
| 43 |
+
global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
| 44 |
+
|
| 45 |
+
for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) {
|
| 46 |
+
dst_row[ind] = (float)src_row[ind];
|
| 47 |
+
}
|
| 48 |
+
}
|
| 49 |
+
|
| 50 |
+
kernel void kernel_set_rows_f16(
|
| 51 |
+
global char * src0,
|
| 52 |
+
ulong offset0,
|
| 53 |
+
global char * src1,
|
| 54 |
+
ulong offset1,
|
| 55 |
+
global char * dst,
|
| 56 |
+
ulong offsetd,
|
| 57 |
+
int ne01,
|
| 58 |
+
ulong nb01,
|
| 59 |
+
ulong nb02,
|
| 60 |
+
ulong nb03,
|
| 61 |
+
int ne11,
|
| 62 |
+
int ne12,
|
| 63 |
+
ulong nb10,
|
| 64 |
+
ulong nb11,
|
| 65 |
+
ulong nb12,
|
| 66 |
+
int nblk0,
|
| 67 |
+
ulong nb1,
|
| 68 |
+
ulong nb2,
|
| 69 |
+
ulong nb3
|
| 70 |
+
) {
|
| 71 |
+
src0 = src0 + offset0;
|
| 72 |
+
src1 = src1 + offset1;
|
| 73 |
+
dst = dst + offsetd;
|
| 74 |
+
|
| 75 |
+
int i03 = get_group_id(2);
|
| 76 |
+
int i02 = get_group_id(1);
|
| 77 |
+
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
| 78 |
+
|
| 79 |
+
if (i01 >= ne01) {
|
| 80 |
+
return;
|
| 81 |
+
}
|
| 82 |
+
|
| 83 |
+
int i12 = i03%ne12;
|
| 84 |
+
int i11 = i02%ne11;
|
| 85 |
+
|
| 86 |
+
int i10 = i01;
|
| 87 |
+
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
| 88 |
+
|
| 89 |
+
global half * dst_row = (global half *) (dst + i1*nb1 + i02*nb2 + i03*nb3);
|
| 90 |
+
global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
| 91 |
+
|
| 92 |
+
for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) {
|
| 93 |
+
dst_row[ind] = src_row[ind];
|
| 94 |
+
}
|
| 95 |
+
}
|