Akarshan Biswas committed on
Commit
e62ef85
·
1 Parent(s): 92b2d32

SYCL: Initial set_rows kernel implementation (llama/14562)

Browse files

* SYCL: Initial set_rows kernel implementation

* Revert max_threads to 256

* Refactor set_rows and address review comments

* Deduplicate conversion function

* Remove guard before kernel launch and refactor

* Fix and add back SFINAE

ggml/src/ggml-sycl/backend.hpp CHANGED
@@ -30,6 +30,7 @@
30
  #include "outprod.hpp"
31
  #include "quants.hpp"
32
  #include "rope.hpp"
 
33
  #include "softmax.hpp"
34
  #include "tsembd.hpp"
35
  #include "wkv.hpp"
 
30
  #include "outprod.hpp"
31
  #include "quants.hpp"
32
  #include "rope.hpp"
33
+ #include "set_rows.hpp"
34
  #include "softmax.hpp"
35
  #include "tsembd.hpp"
36
  #include "wkv.hpp"
ggml/src/ggml-sycl/ggml-sycl.cpp CHANGED
@@ -41,6 +41,7 @@
41
  #include "ggml-sycl/element_wise.hpp"
42
  #include "ggml-sycl/presets.hpp"
43
  #include "ggml-sycl/gemm.hpp"
 
44
  #include "ggml-sycl/sycl_hw.hpp"
45
  #include "ggml-sycl/getrows.hpp"
46
  #include "ggml.h"
@@ -3605,6 +3606,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
3605
  case GGML_OP_GET_ROWS:
3606
  ggml_sycl_get_rows(ctx, dst);
3607
  break;
 
 
 
3608
  case GGML_OP_DUP:
3609
  ggml_sycl_dup(ctx, dst);
3610
  break;
@@ -4299,7 +4303,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
4299
  {
4300
  // TODO: add support
4301
  // ref: https://github.com/ggml-org/llama.cpp/pull/14274
4302
- return false;
4303
  } break;
4304
  case GGML_OP_CPY:
4305
  {
 
41
  #include "ggml-sycl/element_wise.hpp"
42
  #include "ggml-sycl/presets.hpp"
43
  #include "ggml-sycl/gemm.hpp"
44
+ #include "ggml-sycl/set_rows.hpp"
45
  #include "ggml-sycl/sycl_hw.hpp"
46
  #include "ggml-sycl/getrows.hpp"
47
  #include "ggml.h"
 
3606
  case GGML_OP_GET_ROWS:
3607
  ggml_sycl_get_rows(ctx, dst);
3608
  break;
3609
+ case GGML_OP_SET_ROWS:
3610
+ ggml_sycl_op_set_rows(ctx, dst);
3611
+ break;
3612
  case GGML_OP_DUP:
3613
  ggml_sycl_dup(ctx, dst);
3614
  break;
 
4303
  {
4304
  // TODO: add support
4305
  // ref: https://github.com/ggml-org/llama.cpp/pull/14274
4306
+ return (op->type == GGML_TYPE_F32 || (op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_I64));
4307
  } break;
4308
  case GGML_OP_CPY:
4309
  {
ggml/src/ggml-sycl/set_rows.cpp ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #include "set_rows.hpp"
2
+
3
+ namespace utils {
4
+ template<typename T>
5
+ static constexpr bool is_arithmetic_v() {
6
+ return std::is_arithmetic_v<T> || std::is_same_v<T, sycl::half> || std::is_same_v<T, sycl::ext::oneapi::bfloat16>;
7
+ }
8
+ }
9
+ template<typename TIn, typename TOut>
10
+ static inline std::enable_if_t<utils::is_arithmetic_v<TIn>() && utils::is_arithmetic_v<TOut>(), void>
11
+ convert (const char* src, char* dst) {
12
+ auto src_val = *reinterpret_cast<const TIn*>(src);
13
+ auto dst_val = sycl::vec<TIn, 1>(src_val).template convert<TOut, sycl::rounding_mode::automatic>()[0];
14
+ *reinterpret_cast<TOut*>(dst) = dst_val;;
15
+ }
16
+
17
+ template<typename TIn, typename TOut>
18
+ static void k_set_rows(
19
+ const char * __restrict__ src0, const int64_t * __restrict__ src1, char * __restrict__ dst,
20
+ const int64_t ne00, const int64_t ne01, const int64_t ne11, const int64_t ne12,
21
+ const size_t nb01, const size_t nb02, const size_t nb03,
22
+ const size_t nb10, const size_t nb11, const size_t nb12,
23
+ const size_t nb1, const size_t nb2, const size_t nb3,
24
+ const size_t src_type_size, const size_t dst_type_size,
25
+ const sycl::nd_item<3> & item_ct1) {
26
+
27
+ const int i03 = item_ct1.get_group(0);
28
+ const int i02 = item_ct1.get_group(1);
29
+ const int i01 = item_ct1.get_group(2) * item_ct1.get_local_range(1) + item_ct1.get_local_id(1); // Row index
30
+
31
+ if (i01 >= ne01) {
32
+ return;
33
+ }
34
+
35
+ const int i12 = i03 % ne12;
36
+ const int i11 = i02 % ne11;
37
+ const int i10 = i01;
38
+
39
+ const int64_t dst_row = *(const int64_t *)((const char *)src1 + calculate_offset<3>({nb10, nb11, nb12}, {i10, i11, i12}));
40
+
41
+ const char * src0_row = src0 + calculate_offset<3>({nb01, nb02, nb03}, {i01, i02, i03});
42
+ char * dst_row_ptr = dst + dst_row*nb1 + i02*nb2 + i03*nb3;
43
+
44
+ for (int col = item_ct1.get_local_id(0); col < ne00; col += item_ct1.get_local_range(0)) {
45
+ const char * src_elem = src0_row + col * src_type_size;
46
+ char * dst_elem = dst_row_ptr + col * dst_type_size;
47
+ convert<TIn, TOut>(src_elem, dst_elem);
48
+ }
49
+ }
50
+
51
+ template<typename TIn, typename TOut>
52
+ static void set_rows_sycl(
53
+ const char * src0_d, const int64_t * src1_d, char * dst_d,
54
+ const int64_t ne00, const int64_t ne01, const int64_t ne02, const int64_t ne03,
55
+ const int64_t ne11, const int64_t ne12, const size_t nb01, const size_t nb02, const size_t nb03,
56
+ const size_t nb10, const size_t nb11, const size_t nb12,
57
+ const size_t nb1, const size_t nb2, const size_t nb3,
58
+ const size_t src_type_size, const size_t dst_type_size,
59
+ queue_ptr stream) {
60
+
61
+ constexpr int max_threads_per_row = 64; // KEEPING 64 for now
62
+ const int threads_per_row = std::min((int)ne00, max_threads_per_row);
63
+
64
+ constexpr int max_threads_per_block = 64;
65
+ const int rows_per_block = std::max(1, max_threads_per_block / threads_per_row);
66
+
67
+ const sycl::range<3> block_size(1, rows_per_block, threads_per_row);
68
+ const sycl::range<3> grid_size(ne03, ne02, (ne01 + rows_per_block - 1) / rows_per_block);
69
+
70
+ sycl_parallel_for(
71
+ stream,
72
+ sycl::nd_range<3>(grid_size * block_size, block_size),
73
+ [=](sycl::nd_item<3> item_ct1) {
74
+ k_set_rows<TIn, TOut>(
75
+ src0_d, src1_d, dst_d,
76
+ ne00, ne01, ne11, ne12,
77
+ nb01, nb02, nb03,
78
+ nb10, nb11, nb12,
79
+ nb1, nb2, nb3,
80
+ src_type_size, dst_type_size,
81
+ item_ct1
82
+ );
83
+ }
84
+ );
85
+ }
86
+
87
+
88
+ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
89
+ scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
90
+ const ggml_tensor * src0 = dst->src[0];
91
+ const ggml_tensor * src1 = dst->src[1];
92
+
93
+ GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
94
+ GGML_ASSERT(dst->src[1]->type == GGML_TYPE_I64);
95
+
96
+ GGML_TENSOR_BINARY_OP_LOCALS
97
+
98
+ const int64_t * src1_dd = static_cast<const int64_t *>(src1->data);
99
+
100
+ dpct::queue_ptr stream = ctx.stream();
101
+ switch (dst->type) {
102
+ case GGML_TYPE_F32:
103
+ set_rows_sycl<float, float>(
104
+ (const char *)src0->data, src1_dd, (char *)dst->data,
105
+ ne00, ne01, ne02, ne03,
106
+ ne11, ne12,
107
+ nb01, nb02, nb03,
108
+ nb10, nb11, nb12,
109
+ nb1, nb2, nb3,
110
+ sizeof(float), sizeof(float),
111
+ stream
112
+ );
113
+ break;
114
+ case GGML_TYPE_F16:
115
+ dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 });
116
+ set_rows_sycl<float, sycl::half>(
117
+ (const char *)src0->data, src1_dd, (char *)dst->data,
118
+ ne00, ne01, ne02, ne03,
119
+ ne11, ne12,
120
+ nb01, nb02, nb03,
121
+ nb10, nb11, nb12,
122
+ nb1, nb2, nb3,
123
+ sizeof(float), sizeof(sycl::half),
124
+ stream
125
+ );
126
+ break;
127
+ default:
128
+ GGML_ABORT("Unsupported tensor type!");
129
+ break;
130
+ }
131
+ }
ggml/src/ggml-sycl/set_rows.hpp ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ #ifndef GGML_SYCL_SET_ROWS_HPP
2
+ #define GGML_SYCL_SET_ROWS_HPP
3
+
4
+ #include "common.hpp"
5
+
6
+ void ggml_sycl_op_set_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
7
+
8
+ #endif // GGML_SYCL_SET_ROWS_HPP