Spaces:
Running
Running
Akarshan Biswas
committed on
Commit
·
e62ef85
1
Parent(s):
92b2d32
SYCL: Initial set_rows kernel implementation (llama/14562)
Browse files
* SYCL: Initial set_rows kernel implementation
* Revert max_threads to 256
* Refactor set_rows and address review comments
* Deduplicate conversion function
* Remove guard before kernel launch and refactor
* Fix and add back SFINAE
ggml/src/ggml-sycl/backend.hpp
CHANGED
|
@@ -30,6 +30,7 @@
|
|
| 30 |
#include "outprod.hpp"
|
| 31 |
#include "quants.hpp"
|
| 32 |
#include "rope.hpp"
|
|
|
|
| 33 |
#include "softmax.hpp"
|
| 34 |
#include "tsembd.hpp"
|
| 35 |
#include "wkv.hpp"
|
|
|
|
| 30 |
#include "outprod.hpp"
|
| 31 |
#include "quants.hpp"
|
| 32 |
#include "rope.hpp"
|
| 33 |
+
#include "set_rows.hpp"
|
| 34 |
#include "softmax.hpp"
|
| 35 |
#include "tsembd.hpp"
|
| 36 |
#include "wkv.hpp"
|
ggml/src/ggml-sycl/ggml-sycl.cpp
CHANGED
|
@@ -41,6 +41,7 @@
|
|
| 41 |
#include "ggml-sycl/element_wise.hpp"
|
| 42 |
#include "ggml-sycl/presets.hpp"
|
| 43 |
#include "ggml-sycl/gemm.hpp"
|
|
|
|
| 44 |
#include "ggml-sycl/sycl_hw.hpp"
|
| 45 |
#include "ggml-sycl/getrows.hpp"
|
| 46 |
#include "ggml.h"
|
|
@@ -3605,6 +3606,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
|
|
| 3605 |
case GGML_OP_GET_ROWS:
|
| 3606 |
ggml_sycl_get_rows(ctx, dst);
|
| 3607 |
break;
|
|
|
|
|
|
|
|
|
|
| 3608 |
case GGML_OP_DUP:
|
| 3609 |
ggml_sycl_dup(ctx, dst);
|
| 3610 |
break;
|
|
@@ -4299,7 +4303,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
|
|
| 4299 |
{
|
| 4300 |
// TODO: add support
|
| 4301 |
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
| 4302 |
-
return
|
| 4303 |
} break;
|
| 4304 |
case GGML_OP_CPY:
|
| 4305 |
{
|
|
|
|
| 41 |
#include "ggml-sycl/element_wise.hpp"
|
| 42 |
#include "ggml-sycl/presets.hpp"
|
| 43 |
#include "ggml-sycl/gemm.hpp"
|
| 44 |
+
#include "ggml-sycl/set_rows.hpp"
|
| 45 |
#include "ggml-sycl/sycl_hw.hpp"
|
| 46 |
#include "ggml-sycl/getrows.hpp"
|
| 47 |
#include "ggml.h"
|
|
|
|
| 3606 |
case GGML_OP_GET_ROWS:
|
| 3607 |
ggml_sycl_get_rows(ctx, dst);
|
| 3608 |
break;
|
| 3609 |
+
case GGML_OP_SET_ROWS:
|
| 3610 |
+
ggml_sycl_op_set_rows(ctx, dst);
|
| 3611 |
+
break;
|
| 3612 |
case GGML_OP_DUP:
|
| 3613 |
ggml_sycl_dup(ctx, dst);
|
| 3614 |
break;
|
|
|
|
| 4303 |
{
|
| 4304 |
// TODO: add support
|
| 4305 |
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
| 4306 |
+
// The SYCL set_rows implementation asserts an F32 source and I64 row indices
// for every destination type, so require both here for F32 as well as F16
// destinations instead of accepting any F32-destination op unconditionally.
return (op->type == GGML_TYPE_F32 || op->type == GGML_TYPE_F16) &&
       op->src[0]->type == GGML_TYPE_F32 &&
       op->src[1]->type == GGML_TYPE_I64;
|
| 4307 |
} break;
|
| 4308 |
case GGML_OP_CPY:
|
| 4309 |
{
|
ggml/src/ggml-sycl/set_rows.cpp
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#include "set_rows.hpp"
|
| 2 |
+
|
| 3 |
+
namespace utils {
|
| 4 |
+
template<typename T>
|
| 5 |
+
static constexpr bool is_arithmetic_v() {
|
| 6 |
+
return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half> || std::is_same_v<T, sycl::ext::oneapi::bfloat16>;
|
| 7 |
+
}
|
| 8 |
+
}
|
| 9 |
+
template<typename TIn, typename TOut>
|
| 10 |
+
static inline std::enable_if_t<utils::is_arithmetic_v<TIn>() && utils::is_arithmetic_v<TOut>(), void>
|
| 11 |
+
convert (const char* src, char* dst) {
|
| 12 |
+
auto src_val = *reinterpret_cast<const TIn*>(src);
|
| 13 |
+
auto dst_val = sycl::vec<TIn, 1>(src_val).template convert<TOut, sycl::rounding_mode::automatic>()[0];
|
| 14 |
+
*reinterpret_cast<TOut*>(dst) = dst_val;;
|
| 15 |
+
}
|
| 16 |
+
|
| 17 |
+
template<typename TIn, typename TOut>
|
| 18 |
+
static void k_set_rows(
|
| 19 |
+
const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst,
|
| 20 |
+
const int64_t ne00, const int64_t ne01, const int64_t ne11, const int64_t ne12,
|
| 21 |
+
const size_t nb01, const size_t nb02, const size_t nb03,
|
| 22 |
+
const size_t nb10, const size_t nb11, const size_t nb12,
|
| 23 |
+
const size_t nb1, const size_t nb2, const size_t nb3,
|
| 24 |
+
const size_t src_type_size, const size_t dst_type_size,
|
| 25 |
+
const sycl::nd_item<3> & item_ct1) {
|
| 26 |
+
|
| 27 |
+
const int i03 = item_ct1.get_group(0);
|
| 28 |
+
const int i02 = item_ct1.get_group(1);
|
| 29 |
+
const int i01 = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); // Row index
|
| 30 |
+
|
| 31 |
+
if (i01 >= ne01) {
|
| 32 |
+
return;
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
const int i12 = i03 % ne12;
|
| 36 |
+
const int i11 = i02 % ne11;
|
| 37 |
+
const int i10 = i01;
|
| 38 |
+
|
| 39 |
+
const int64_t dst_row = *(const int64_t *)((const char *)src1 + calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12}));
|
| 40 |
+
|
| 41 |
+
const char * src0_row = src0 + calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03});
|
| 42 |
+
char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3;
|
| 43 |
+
|
| 44 |
+
for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) {
|
| 45 |
+
const char * src_elem = src0_row + col * src_type_size;
|
| 46 |
+
char * dst_elem = dst_row_ptr + col * dst_type_size;
|
| 47 |
+
convert<TIn, TOut>(src_elem, dst_elem);
|
| 48 |
+
}
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
template<typename TIn, typename TOut>
|
| 52 |
+
static void set_rows_sycl(
|
| 53 |
+
const char * src0_d, const int64_t * src1_d, char * dst_d,
|
| 54 |
+
const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
|
| 55 |
+
const int64_t ne11, const int64_t ne12, const size_t nb01, const size_t nb02, const size_t nb03,
|
| 56 |
+
const size_t nb10, const size_t nb11, const size_t nb12,
|
| 57 |
+
const size_t nb1, const size_t nb2, const size_t nb3,
|
| 58 |
+
const size_t src_type_size, const size_t dst_type_size,
|
| 59 |
+
queue_ptr stream) {
|
| 60 |
+
|
| 61 |
+
constexpr int max_threads_per_row = 64; // KEEPING 64 for now
|
| 62 |
+
const int threads_per_row = std::min((int)ne00, max_threads_per_row);
|
| 63 |
+
|
| 64 |
+
constexpr int max_threads_per_block = 64;
|
| 65 |
+
const int rows_per_block = std::max(1, max_threads_per_block / threads_per_row);
|
| 66 |
+
|
| 67 |
+
const sycl::range<3> block_size(1, rows_per_block, threads_per_row);
|
| 68 |
+
const sycl::range<3> grid_size(ne03, ne02, (ne01 + rows_per_block - 1) / rows_per_block);
|
| 69 |
+
|
| 70 |
+
sycl_parallel_for(
|
| 71 |
+
stream,
|
| 72 |
+
sycl::nd_range<3>(grid_size * block_size, block_size),
|
| 73 |
+
[=](sycl::nd_item<3> item_ct1) {
|
| 74 |
+
k_set_rows<TIn, TOut>(
|
| 75 |
+
src0_d, src1_d, dst_d,
|
| 76 |
+
ne00, ne01, ne11, ne12,
|
| 77 |
+
nb01, nb02, nb03,
|
| 78 |
+
nb10, nb11, nb12,
|
| 79 |
+
nb1, nb2, nb3,
|
| 80 |
+
src_type_size, dst_type_size,
|
| 81 |
+
item_ct1
|
| 82 |
+
);
|
| 83 |
+
}
|
| 84 |
+
);
|
| 85 |
+
}
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
// Backend entry point for GGML_OP_SET_ROWS.
//
// Requires dst->src[0] to be F32 and dst->src[1] to be I64 (both asserted).
// Dispatches on dst->type: F32 copies as-is, F16 converts float -> sycl::half
// after checking that the device exposes the fp16 aspect. Any other
// destination type aborts.
void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
    const ggml_tensor * src0 = dst->src[0];
    const ggml_tensor * src1 = dst->src[1];

    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I64);

    GGML_TENSOR_BINARY_OP_LOCALS

    const char *    src0_bytes  = static_cast<const char *>(src0->data);
    const int64_t * row_indices = static_cast<const int64_t *>(src1->data);
    char *          dst_bytes   = static_cast<char *>(dst->data);

    dpct::queue_ptr stream = ctx.stream();

    if (dst->type == GGML_TYPE_F32) {
        set_rows_sycl<float, float>(
            src0_bytes, row_indices, dst_bytes,
            ne00, ne01, ne02, ne03,
            ne11, ne12,
            nb01, nb02, nb03,
            nb10, nb11, nb12,
            nb1, nb2, nb3,
            sizeof(float), sizeof(float),
            stream);
    } else if (dst->type == GGML_TYPE_F16) {
        // Half-precision output needs device support for fp16.
        dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
        set_rows_sycl<float, sycl::half>(
            src0_bytes, row_indices, dst_bytes,
            ne00, ne01, ne02, ne03,
            ne11, ne12,
            nb01, nb02, nb03,
            nb10, nb11, nb12,
            nb1, nb2, nb3,
            sizeof(float), sizeof(sycl::half),
            stream);
    } else {
        GGML_ABORT("Unsupported tensor type!");
    }
}
|
ggml/src/ggml-sycl/set_rows.hpp
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#ifndef GGML_SYCL_SET_ROWS_HPP
|
| 2 |
+
#define GGML_SYCL_SET_ROWS_HPP
|
| 3 |
+
|
| 4 |
+
#include "common.hpp"
|
| 5 |
+
|
| 6 |
+
// Runs GGML_OP_SET_ROWS for `dst` on the stream obtained from `ctx`.
// The implementation asserts that dst->src[0] is F32 and dst->src[1] is I64,
// and supports F32 or F16 as the destination type (other types abort).
void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
|
| 7 |
+
|
| 8 |
+
#endif // GGML_SYCL_SET_ROWS_HPP
|