slpnix committed

Commit c8008b8 · Parent: 7f24ebb

kompute : improve backend to pass test_backend_ops (llama/10542)


* kompute: op_unary: reject unsupported parameters

Signed-off-by: Sergio Lopez <[email protected]>

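A minimal C++ sketch of the parameter filtering this adds: element counts that are not a multiple of 4 (8 for GELU) are rejected so those tensors fall back to another backend. The helper name is illustrative; the actual change lives in ggml_backend_kompute_device_supports_op, shown in the diff below.

```cpp
#include <cstdint>
#include "ggml.h"

// Sketch, not verbatim from the commit: mirrors the checks added to
// ggml_backend_kompute_device_supports_op for GGML_OP_UNARY. The helper
// name is hypothetical; ggml_nelements/ggml_get_unary_op/ggml_is_contiguous
// are existing ggml APIs.
static bool kompute_supports_unary(const struct ggml_tensor * op) {
    const int64_t n = ggml_nelements(op);
    if (n % 4 != 0) {
        return false;                     // the unary kernels assume n % 4 == 0
    }
    switch (ggml_get_unary_op(op)) {
        case GGML_UNARY_OP_GELU:
            if (n % 8 != 0) {
                return false;             // GELU additionally assumes n % 8 == 0
            }
            // fall through
        case GGML_UNARY_OP_RELU:
        case GGML_UNARY_OP_SILU:
            return ggml_is_contiguous(op->src[0]);
        default:
            return false;
    }
}
```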
* kompute: softmax: implement ALiBi support

Signed-off-by: Sergio Lopez <[email protected]>

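For reference, a small C++ sketch of the ALiBi values this change wires into op_softmax.comp as push constants (m0, m1, n_head_log2) and the per-head slope the shader derives from them. The helper name is illustrative; the arithmetic follows the diff below.

```cpp
#include <cmath>
#include <cstdint>

// Slope applied to the mask of attention head h when max_bias > 0 (ALiBi).
// m0/m1/n_head_log2 are the values ggml_vk_graph_compute now passes to
// ggml_vk_soft_max; the slope formula matches the shader.
static float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
    if (max_bias <= 0.0f) {
        return 1.0f; // ALiBi disabled, mask is used unscaled
    }
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    const float   base = h < n_head_log2 ? m0 : m1;
    const int32_t exp  = h < n_head_log2 ? (int32_t) h + 1 : 2 * ((int32_t) h - (int32_t) n_head_log2) + 1;
    return powf(base, (float) exp);
}
```

The shader then evaluates the softmax over x*scale + slope*mask per row, as seen in op_softmax.comp below.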
* kompute: rope: implement neox and phi3 support

Signed-off-by: Sergio Lopez <[email protected]>

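The single rope kernel is split into _norm and _neox variants, and the phi3 frequency factors (src2) are honored when present. Below is a CPU-side sketch of the NEOX rotation with optional frequency factors, operating on one contiguous row and omitting the YaRN correction (rope_yarn) for brevity; names are illustrative, the real logic is in the op_rope_neox_*.comp shaders added in this commit.

```cpp
#include <cmath>

// NEOX-style RoPE on one row: element ic is paired with ic + n_dims/2
// (NORM mode instead pairs i0 with i0+1). freq_factors may be null.
static void rope_neox_row(const float * src, float * dst, int ne0, int n_dims,
                          int pos, float freq_base, const float * freq_factors) {
    const float inv_ndims = -1.0f / n_dims;
    for (int i0 = 0; i0 < ne0; i0 += 2) {
        if (i0 < n_dims) {
            const int   ic          = i0 / 2;
            const float freq_factor = freq_factors ? freq_factors[ic] : 1.0f;
            const float theta       = (float) pos * powf(freq_base, inv_ndims * (float) i0) / freq_factor;
            const float cos_theta   = cosf(theta);
            const float sin_theta   = sinf(theta);

            const float x0 = src[ic];
            const float x1 = src[ic + n_dims/2];
            dst[ic]            = x0*cos_theta - x1*sin_theta;
            dst[ic + n_dims/2] = x0*sin_theta + x1*cos_theta;
        } else {
            // dimensions past n_dims are passed through unchanged
            dst[i0 + 0] = src[i0 + 0];
            dst[i0 + 1] = src[i0 + 1];
        }
    }
}
```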
* kompute: op_mul_mat_q4_k permuted support

Signed-off-by: Sergio Lopez <[email protected]>

* kompute: op_mul_mat_[q4_0|q4_1|q8_0] permuted support

Signed-off-by: Sergio Lopez <[email protected]>

* kompute: op_mul_mat_f16 permuted support

Signed-off-by: Sergio Lopez <[email protected]>

* kompute: op_mul_mat_q6_k permuted support

Signed-off-by: Sergio Lopez <[email protected]>

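All of the permuted-support changes above follow the same pattern: instead of assuming contiguous rows, block offsets are derived from the byte strides of src0 (nb01/nb02/nb03) and src1 (nb11/nb12/nb13), which are now passed as push constants, so permuted/transposed views index correctly. A hedged C++ sketch of that offset arithmetic (struct and function names are illustrative; SIZE_OF_BLOCK is the byte size of one quantized block, as in the shaders):

```cpp
#include <cstdint>

// Push-constant strides, named after the shader parameters.
struct mm_strides {
    uint32_t nb01, nb02, nb03; // src0 byte strides: row, plane, batch
    uint32_t nb11, nb12, nb13; // src1 byte strides
    uint32_t r2, r3;           // broadcast ratios ne12/ne02 and ne13/ne03
};

// Block index of a src0 row, e.g. in op_mul_mat_q4_k.comp:
//   offset0 = first_row*(nb01/SIZE_OF_BLOCK) + (i12/r2)*(nb02/SIZE_OF_BLOCK) + (i13/r3)*(nb03/SIZE_OF_BLOCK)
static uint32_t src0_block_offset(const mm_strides & s, uint32_t row,
                                  uint32_t i12, uint32_t i13, uint32_t size_of_block) {
    return row          * (s.nb01 / size_of_block)
         + (i12 / s.r2) * (s.nb02 / size_of_block)
         + (i13 / s.r3) * (s.nb03 / size_of_block);
}

// Float index of a src1 row (src1 is f32, hence the division by 4):
//   y = (r1*nb11 + i12*nb12 + i13*nb13) / 4 + inBOff
static uint32_t src1_float_offset(const mm_strides & s, uint32_t r1,
                                  uint32_t i12, uint32_t i13) {
    return (r1*s.nb11 + i12*s.nb12 + i13*s.nb13) / 4;
}
```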
---------

Signed-off-by: Sergio Lopez <[email protected]>

ggml/src/ggml-kompute/CMakeLists.txt CHANGED
@@ -105,8 +105,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
105
  kompute-shaders/op_getrows_q4_0.comp
106
  kompute-shaders/op_getrows_q4_1.comp
107
  kompute-shaders/op_getrows_q6_k.comp
108
- kompute-shaders/op_rope_f16.comp
109
- kompute-shaders/op_rope_f32.comp
 
 
110
  kompute-shaders/op_cpy_f16_f16.comp
111
  kompute-shaders/op_cpy_f16_f32.comp
112
  kompute-shaders/op_cpy_f32_f16.comp
@@ -139,8 +141,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
139
  shaderop_getrows_q4_0.h
140
  shaderop_getrows_q4_1.h
141
  shaderop_getrows_q6_k.h
142
- shaderop_rope_f16.h
143
- shaderop_rope_f32.h
 
 
144
  shaderop_cpy_f16_f16.h
145
  shaderop_cpy_f16_f32.h
146
  shaderop_cpy_f32_f16.h
 
105
  kompute-shaders/op_getrows_q4_0.comp
106
  kompute-shaders/op_getrows_q4_1.comp
107
  kompute-shaders/op_getrows_q6_k.comp
108
+ kompute-shaders/op_rope_norm_f16.comp
109
+ kompute-shaders/op_rope_norm_f32.comp
110
+ kompute-shaders/op_rope_neox_f16.comp
111
+ kompute-shaders/op_rope_neox_f32.comp
112
  kompute-shaders/op_cpy_f16_f16.comp
113
  kompute-shaders/op_cpy_f16_f32.comp
114
  kompute-shaders/op_cpy_f32_f16.comp
 
141
  shaderop_getrows_q4_0.h
142
  shaderop_getrows_q4_1.h
143
  shaderop_getrows_q6_k.h
144
+ shaderop_rope_norm_f16.h
145
+ shaderop_rope_norm_f32.h
146
+ shaderop_rope_neox_f16.h
147
+ shaderop_rope_neox_f32.h
148
  shaderop_cpy_f16_f16.h
149
  shaderop_cpy_f16_f32.h
150
  shaderop_cpy_f32_f16.h
ggml/src/ggml-kompute/ggml-kompute.cpp CHANGED
@@ -28,8 +28,10 @@
28
  #include "shaderop_getrows_q4_0.h"
29
  #include "shaderop_getrows_q4_1.h"
30
  #include "shaderop_getrows_q6_k.h"
31
- #include "shaderop_rope_f16.h"
32
- #include "shaderop_rope_f32.h"
 
 
33
  #include "shaderop_cpy_f16_f16.h"
34
  #include "shaderop_cpy_f16_f32.h"
35
  #include "shaderop_cpy_f32_f16.h"
@@ -345,7 +347,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t
345
  std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
346
  vk::DescriptorPoolSize(
347
  vk::DescriptorType::eStorageBuffer,
348
- 3 * size // Descriptor count is number of possible tensors to pass into an algorithm
349
  )
350
  };
351
 
@@ -788,7 +790,8 @@ static void ggml_vk_soft_max(
788
  const std::shared_ptr<kp::Tensor>& out,
789
  uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
790
  int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
791
- float scale
 
792
  ) {
793
  const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
794
  kp::shader_data::op_softmax_comp_spv_len);
@@ -796,12 +799,14 @@ static void ggml_vk_soft_max(
796
  struct PushConstants {
797
  uint32_t inAOff, inBOff, outOff;
798
  int32_t ne00, ne01, ne02;
799
- float scale;
 
800
  int32_t mask;
801
  } pushConsts {
802
  safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
803
  ne00, ne01, ne02,
804
- scale,
 
805
  bool(inB)
806
  };
807
 
@@ -911,9 +916,9 @@ static void ggml_vk_mul_mat_f16(
911
  const std::shared_ptr<kp::Tensor>& out,
912
  uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
913
  int32_t ne00, int32_t ne01, int32_t ne02,
914
- uint32_t nb00, uint32_t nb01, uint32_t nb02,
915
  int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
916
- uint32_t nb10, uint32_t nb11, uint32_t nb12,
917
  int32_t ne0, int32_t ne1,
918
  uint32_t r2, uint32_t r3
919
  ) {
@@ -923,17 +928,17 @@ static void ggml_vk_mul_mat_f16(
923
  struct PushConstants {
924
  uint32_t inAOff, inBOff, outOff;
925
  int32_t ne00, ne01, ne02;
926
- uint32_t nb00, nb01, nb02;
927
  int32_t ne10, ne11, ne12;
928
- uint32_t nb10, nb11, nb12;
929
  int32_t ne0, ne1;
930
  uint32_t r2, r3;
931
  } pushConsts {
932
  safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
933
  ne00, ne01, ne02,
934
- nb00, nb01, nb02,
935
  ne10, ne11, ne12,
936
- nb10, nb11, nb12,
937
  ne0, ne1,
938
  r2, r3
939
  };
@@ -1013,6 +1018,8 @@ static void ggml_vk_mul_mat_impl(
1013
  int32_t ne00, int32_t ne01, int32_t ne02,
1014
  int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
1015
  int32_t ne0, int32_t ne1,
 
 
1016
  uint32_t r2, uint32_t r3
1017
  ) {
1018
  struct PushConstants {
@@ -1020,19 +1027,23 @@ static void ggml_vk_mul_mat_impl(
1020
  int32_t ne00, ne01, ne02;
1021
  int32_t ne10, ne12;
1022
  int32_t ne0, ne1;
 
 
1023
  uint32_t r2, r3;
1024
  } pushConsts {
1025
  safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
1026
  ne00, ne01, ne02,
1027
  ne10, ne12,
1028
  ne0, ne1,
 
 
1029
  r2, r3
1030
  };
1031
 
1032
  auto name = std::string(__func__) + "_" + suffix;
1033
  std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1034
  if (!komputeManager()->hasAlgorithm(name)) {
1035
- const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
1036
  s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
1037
  } else {
1038
  s_algo = komputeManager()->getAlgorithm(name);
@@ -1074,19 +1085,26 @@ static void ggml_vk_mul_mat_q4_k(
1074
  const std::shared_ptr<kp::Tensor>& inB,
1075
  const std::shared_ptr<kp::Tensor>& out,
1076
  uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1077
- int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne10,
1078
- int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne0,
1079
- int32_t ne1, int32_t r2, int32_t r3
 
 
 
1080
  ) {
1081
  const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv,
1082
  kp::shader_data::op_mul_mat_q4_k_comp_spv_len);
1083
 
1084
  struct PushConstants {
1085
  uint32_t inAOff, inBOff, outOff;
1086
- int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3;
 
 
1087
  } pushConsts {
1088
- 0, 0, 0,
1089
- ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3
 
 
1090
  };
1091
 
1092
  std::shared_ptr<kp::Algorithm> s_algo = nullptr;
@@ -1108,28 +1126,37 @@ static void ggml_vk_mul_mat_q6_k(
1108
  const std::shared_ptr<kp::Tensor>& inB,
1109
  const std::shared_ptr<kp::Tensor>& out,
1110
  uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1111
- int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
1112
- int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02
 
 
 
 
1113
  ) {
1114
  const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
1115
  kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
1116
 
1117
  struct PushConstants {
1118
  uint32_t inAOff, inBOff, outOff;
1119
- int32_t ne00, ne10, ne0, ne1, ne01, gqa;
 
 
1120
  } pushConsts {
1121
  inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
1122
- ne00, ne10, ne0, ne1, ne01, ne12/ne02
 
 
1123
  };
1124
 
1125
  std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1126
  if (!komputeManager()->hasAlgorithm(__func__)) {
1127
- const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
1128
- s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
 
1129
  } else {
1130
  s_algo = komputeManager()->getAlgorithm(__func__);
1131
  s_algo->setTensors({inA, inB, out});
1132
- s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)});
1133
  s_algo->setPushConstants<PushConstants>({pushConsts});
1134
  s_algo->updateDescriptors(s_kompute_context->pool.get());
1135
  }
@@ -1217,10 +1244,11 @@ static void ggml_vk_rope(
1217
  kp::Sequence& seq,
1218
  const std::shared_ptr<kp::Tensor>& inA,
1219
  const std::shared_ptr<kp::Tensor>& inB,
 
1220
  const std::shared_ptr<kp::Tensor>& out,
1221
- uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1222
  ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
1223
- float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
1224
  int32_t ne01, int32_t ne02, int32_t ne03,
1225
  uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
1226
  int32_t ne0,
@@ -1228,11 +1256,17 @@ static void ggml_vk_rope(
1228
  ) {
1229
  GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);
1230
 
1231
- static const auto spirv_f16 = getSpirvShader(
1232
- kp::shader_data::op_rope_f16_comp_spv, kp::shader_data::op_rope_f16_comp_spv_len
 
 
 
 
 
 
1233
  );
1234
- static const auto spirv_f32 = getSpirvShader(
1235
- kp::shader_data::op_rope_f32_comp_spv, kp::shader_data::op_rope_f32_comp_spv_len
1236
  );
1237
 
1238
  int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;
@@ -1247,32 +1281,40 @@ static void ggml_vk_rope(
1247
  GGML_ASSERT(nb0 % type_size == 0);
1248
 
1249
  struct PushConstants {
1250
- uint32_t inAOff, inBOff, outOff;
1251
  int32_t n_dims, mode, n_ctx_orig;
1252
- float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
 
 
1253
  uint32_t nb00, nb01, nb02, nb03;
1254
  int32_t ne0;
1255
  uint32_t nb0, nb1, nb2, nb3;
1256
  } pushConsts {
1257
- safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
1258
  n_dims, mode, n_ctx_orig,
1259
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
 
 
1260
  nb00, nb01, nb02, nb03,
1261
  ne0,
1262
  nb0, nb1, nb2, nb3
1263
  };
1264
 
1265
- auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
 
 
 
 
1266
  std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1267
  if (!komputeManager()->hasAlgorithm(name)) {
 
1268
  s_algo = komputeManager()->algorithm<float, PushConstants>(
1269
- name, s_kompute_context->pool.get(), {inA, inB, out},
1270
- src0t == GGML_TYPE_F16 ? spirv_f16 : spirv_f32,
1271
  {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
1272
  );
1273
  } else {
1274
  s_algo = komputeManager()->getAlgorithm(name);
1275
- s_algo->setTensors({inA, inB, out});
1276
  s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
1277
  s_algo->setPushConstants<PushConstants>({pushConsts});
1278
  s_algo->updateDescriptors(s_kompute_context->pool.get());
@@ -1351,11 +1393,15 @@ static void ggml_vk_cpy_f16_f32(Args&&... args) {
1351
  }
1352
 
1353
  static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
 
1354
  switch (op->op) {
1355
  case GGML_OP_UNARY:
 
1356
  switch (ggml_get_unary_op(op)) {
1357
- case GGML_UNARY_OP_RELU:
1358
  case GGML_UNARY_OP_GELU:
 
 
 
1359
  case GGML_UNARY_OP_SILU:
1360
  return ggml_is_contiguous(op->src[0]);
1361
  default:
@@ -1413,8 +1459,8 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
1413
 
1414
  switch (op->src[0]->type) {
1415
  case GGML_TYPE_F32:
1416
- case GGML_TYPE_Q6_K:
1417
  return op->ne[3] == 1;
 
1418
  case GGML_TYPE_F16:
1419
  case GGML_TYPE_Q8_0:
1420
  case GGML_TYPE_Q4_0:
@@ -1515,9 +1561,11 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
1515
  const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
1516
  uint32_t off_src0 = 0;
1517
  uint32_t off_src1 = 0;
 
1518
  uint32_t off_dst = 0;
1519
  const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
1520
  const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
 
1521
  const std::shared_ptr<kp::Tensor>& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor;
1522
 
1523
  switch (dst->op) {
@@ -1593,11 +1641,16 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
1593
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
1594
  GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
1595
 
1596
- #pragma message("TODO: add ALiBi support")
1597
- #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/7192")
1598
- GGML_ASSERT(max_bias == 0.0f);
 
 
1599
 
1600
- ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
 
 
 
1601
  } break;
1602
  case GGML_OP_DIAG_MASK_INF:
1603
  {
@@ -1649,38 +1702,44 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
1649
  case GGML_TYPE_F16:
1650
  ggml_vk_mul_mat_f16(
1651
  seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1652
- ne00, ne01, ne02, nb00, nb01, nb02, ne10, ne11, ne12, ne13, nb10, nb11, nb12,
 
1653
  ne0, ne1, r2, r3
1654
  );
1655
  break;
1656
  case GGML_TYPE_Q8_0:
1657
  ggml_vk_mul_mat_q8_0(
1658
  seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1659
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
 
1660
  );
1661
  break;
1662
  case GGML_TYPE_Q4_0:
1663
  ggml_vk_mul_mat_q4_0(
1664
  seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1665
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
 
1666
  );
1667
  break;
1668
  case GGML_TYPE_Q4_1:
1669
  ggml_vk_mul_mat_q4_1(
1670
  seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1671
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, r2, r3
 
1672
  );
1673
  break;
1674
  case GGML_TYPE_Q4_K:
1675
  ggml_vk_mul_mat_q4_k(
1676
  seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1677
- ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, ne12/ne02, ne13/ne03
 
1678
  );
1679
  break;
1680
  case GGML_TYPE_Q6_K:
1681
  ggml_vk_mul_mat_q6_k(
1682
  seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1683
- ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02
 
1684
  );
1685
  break;
1686
  default: {
@@ -1709,13 +1768,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
1709
  } break;
1710
  case GGML_OP_ROPE:
1711
  {
1712
- #pragma message("TODO: implement phi3 frequency factors support")
1713
- #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7225")
1714
- GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
1715
-
1716
- #pragma message("TODO: update rope NORM mode to match NEOX mode")
1717
- #pragma message(" https://github.com/ggerganov/llama.cpp/pull/7634")
1718
-
1719
  GGML_ASSERT(ne10 == ne02);
1720
  GGML_ASSERT(src0t == dstt);
1721
  // const int n_past = ((int32_t *) dst->op_params)[0];
@@ -1724,6 +1776,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
1724
  // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
1725
  const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
1726
 
 
 
1727
  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
1728
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
1729
  memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
@@ -1732,8 +1786,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
1732
  memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
1733
  memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
1734
  ggml_vk_rope(
1735
- seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig,
1736
- freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
1737
  ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
1738
  );
1739
  } break;
 
28
  #include "shaderop_getrows_q4_0.h"
29
  #include "shaderop_getrows_q4_1.h"
30
  #include "shaderop_getrows_q6_k.h"
31
+ #include "shaderop_rope_norm_f16.h"
32
+ #include "shaderop_rope_norm_f32.h"
33
+ #include "shaderop_rope_neox_f16.h"
34
+ #include "shaderop_rope_neox_f32.h"
35
  #include "shaderop_cpy_f16_f16.h"
36
  #include "shaderop_cpy_f16_f32.h"
37
  #include "shaderop_cpy_f32_f16.h"
 
347
  std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
348
  vk::DescriptorPoolSize(
349
  vk::DescriptorType::eStorageBuffer,
350
+ 4 * size // Descriptor count is number of possible tensors to pass into an algorithm
351
  )
352
  };
353
 
 
790
  const std::shared_ptr<kp::Tensor>& out,
791
  uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
792
  int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
793
+ float scale, float max_bias, float m0, float m1,
794
+ uint32_t n_head_log2
795
  ) {
796
  const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
797
  kp::shader_data::op_softmax_comp_spv_len);
 
799
  struct PushConstants {
800
  uint32_t inAOff, inBOff, outOff;
801
  int32_t ne00, ne01, ne02;
802
+ float scale, max_bias, m0, m1;
803
+ uint32_t n_head_log2;
804
  int32_t mask;
805
  } pushConsts {
806
  safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
807
  ne00, ne01, ne02,
808
+ scale, max_bias, m0, m1,
809
+ n_head_log2,
810
  bool(inB)
811
  };
812
 
 
916
  const std::shared_ptr<kp::Tensor>& out,
917
  uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
918
  int32_t ne00, int32_t ne01, int32_t ne02,
919
+ uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
920
  int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
921
+ uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13,
922
  int32_t ne0, int32_t ne1,
923
  uint32_t r2, uint32_t r3
924
  ) {
 
928
  struct PushConstants {
929
  uint32_t inAOff, inBOff, outOff;
930
  int32_t ne00, ne01, ne02;
931
+ uint32_t nb00, nb01, nb02, nb03;
932
  int32_t ne10, ne11, ne12;
933
+ uint32_t nb10, nb11, nb12, nb13;
934
  int32_t ne0, ne1;
935
  uint32_t r2, r3;
936
  } pushConsts {
937
  safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
938
  ne00, ne01, ne02,
939
+ nb00, nb01, nb02, nb03,
940
  ne10, ne11, ne12,
941
+ nb10, nb11, nb12, nb13,
942
  ne0, ne1,
943
  r2, r3
944
  };
 
1018
  int32_t ne00, int32_t ne01, int32_t ne02,
1019
  int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
1020
  int32_t ne0, int32_t ne1,
1021
+ uint32_t nb01, uint32_t nb02, uint32_t nb03,
1022
+ uint32_t nb11, uint32_t nb12, uint32_t nb13,
1023
  uint32_t r2, uint32_t r3
1024
  ) {
1025
  struct PushConstants {
 
1027
  int32_t ne00, ne01, ne02;
1028
  int32_t ne10, ne12;
1029
  int32_t ne0, ne1;
1030
+ uint32_t nb01, nb02, nb03;
1031
+ uint32_t nb11, nb12, nb13;
1032
  uint32_t r2, r3;
1033
  } pushConsts {
1034
  safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
1035
  ne00, ne01, ne02,
1036
  ne10, ne12,
1037
  ne0, ne1,
1038
+ nb01, nb02, nb03,
1039
+ nb11, nb12, nb13,
1040
  r2, r3
1041
  };
1042
 
1043
  auto name = std::string(__func__) + "_" + suffix;
1044
  std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1045
  if (!komputeManager()->hasAlgorithm(name)) {
1046
+ const uint32_t local_x = (ggml_vk_current_device().subgroupSize * 2) / 8;
1047
  s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
1048
  } else {
1049
  s_algo = komputeManager()->getAlgorithm(name);
 
1085
  const std::shared_ptr<kp::Tensor>& inB,
1086
  const std::shared_ptr<kp::Tensor>& out,
1087
  uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1088
+ int32_t ne00, int32_t ne01, int32_t ne02,
1089
+ int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
1090
+ int32_t ne0, int32_t ne1,
1091
+ uint32_t nb01, uint32_t nb02, uint32_t nb03,
1092
+ uint32_t nb11, uint32_t nb12, uint32_t nb13,
1093
+ uint32_t r2, uint32_t r3
1094
  ) {
1095
  const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv,
1096
  kp::shader_data::op_mul_mat_q4_k_comp_spv_len);
1097
 
1098
  struct PushConstants {
1099
  uint32_t inAOff, inBOff, outOff;
1100
+ int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
1101
+ uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
1102
+ uint32_t r2, r3;
1103
  } pushConsts {
1104
+ inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
1105
+ ne00, ne10, ne0, ne1, ne01, ne02, ne12,
1106
+ nb01, nb02, nb03, nb11, nb12, nb13,
1107
+ r2, r3
1108
  };
1109
 
1110
  std::shared_ptr<kp::Algorithm> s_algo = nullptr;
 
1126
  const std::shared_ptr<kp::Tensor>& inB,
1127
  const std::shared_ptr<kp::Tensor>& out,
1128
  uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
1129
+ int32_t ne00, int32_t ne01, int32_t ne02,
1130
+ int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
1131
+ int32_t ne0, int32_t ne1,
1132
+ uint32_t nb01, uint32_t nb02, uint32_t nb03,
1133
+ uint32_t nb11, uint32_t nb12, uint32_t nb13,
1134
+ uint32_t r2, uint32_t r3
1135
  ) {
1136
  const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
1137
  kp::shader_data::op_mul_mat_q6_k_comp_spv_len);
1138
 
1139
  struct PushConstants {
1140
  uint32_t inAOff, inBOff, outOff;
1141
+ int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
1142
+ uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
1143
+ uint32_t r2, r3;
1144
  } pushConsts {
1145
  inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
1146
+ ne00, ne10, ne0, ne1, ne01, ne02, ne12,
1147
+ nb01, nb02, nb03, nb11, nb12, nb13,
1148
+ r2, r3
1149
  };
1150
 
1151
  std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1152
  if (!komputeManager()->hasAlgorithm(__func__)) {
1153
+ const uint32_t local_x = 2;
1154
+ const uint32_t local_y = ggml_vk_current_device().subgroupSize;
1155
+ s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts});
1156
  } else {
1157
  s_algo = komputeManager()->getAlgorithm(__func__);
1158
  s_algo->setTensors({inA, inB, out});
1159
+ s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)});
1160
  s_algo->setPushConstants<PushConstants>({pushConsts});
1161
  s_algo->updateDescriptors(s_kompute_context->pool.get());
1162
  }
 
1244
  kp::Sequence& seq,
1245
  const std::shared_ptr<kp::Tensor>& inA,
1246
  const std::shared_ptr<kp::Tensor>& inB,
1247
+ const std::shared_ptr<kp::Tensor>& inC,
1248
  const std::shared_ptr<kp::Tensor>& out,
1249
+ uint32_t inAOff, uint32_t inBOff, uint32_t inCOff, uint32_t outOff,
1250
  ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
1251
+ float freq_base, float freq_scale, bool has_freq_factors, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
1252
  int32_t ne01, int32_t ne02, int32_t ne03,
1253
  uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
1254
  int32_t ne0,
 
1256
  ) {
1257
  GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);
1258
 
1259
+ static const auto spirv_norm_f16 = getSpirvShader(
1260
+ kp::shader_data::op_rope_norm_f16_comp_spv, kp::shader_data::op_rope_norm_f16_comp_spv_len
1261
+ );
1262
+ static const auto spirv_norm_f32 = getSpirvShader(
1263
+ kp::shader_data::op_rope_norm_f32_comp_spv, kp::shader_data::op_rope_norm_f32_comp_spv_len
1264
+ );
1265
+ static const auto spirv_neox_f16 = getSpirvShader(
1266
+ kp::shader_data::op_rope_neox_f16_comp_spv, kp::shader_data::op_rope_neox_f16_comp_spv_len
1267
  );
1268
+ static const auto spirv_neox_f32 = getSpirvShader(
1269
+ kp::shader_data::op_rope_neox_f32_comp_spv, kp::shader_data::op_rope_neox_f32_comp_spv_len
1270
  );
1271
 
1272
  int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;
 
1281
  GGML_ASSERT(nb0 % type_size == 0);
1282
 
1283
  struct PushConstants {
1284
+ uint32_t inAOff, inBOff, inCOff, outOff;
1285
  int32_t n_dims, mode, n_ctx_orig;
1286
+ float freq_base, freq_scale;
1287
+ bool has_freq_factors;
1288
+ float ext_factor, attn_factor, beta_fast, beta_slow;
1289
  uint32_t nb00, nb01, nb02, nb03;
1290
  int32_t ne0;
1291
  uint32_t nb0, nb1, nb2, nb3;
1292
  } pushConsts {
1293
+ safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(inCOff, type_size), safe_divide(outOff, type_size),
1294
  n_dims, mode, n_ctx_orig,
1295
+ freq_base, freq_scale,
1296
+ has_freq_factors,
1297
+ ext_factor, attn_factor, beta_fast, beta_slow,
1298
  nb00, nb01, nb02, nb03,
1299
  ne0,
1300
  nb0, nb1, nb2, nb3
1301
  };
1302
 
1303
+ auto & inC_ = inC ? inC : inA;
1304
+ const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
1305
+ const bool is_f16 = src0t == GGML_TYPE_F16;
1306
+
1307
+ auto name = std::string(__func__) + (is_neox ? "_neox" : "_norm") + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
1308
  std::shared_ptr<kp::Algorithm> s_algo = nullptr;
1309
  if (!komputeManager()->hasAlgorithm(name)) {
1310
+ auto & spirv = is_neox ? is_f16 ? spirv_neox_f16 : spirv_neox_f32 : is_f16 ? spirv_norm_f16 : spirv_norm_f32;
1311
  s_algo = komputeManager()->algorithm<float, PushConstants>(
1312
+ name, s_kompute_context->pool.get(), {inA, inB, inC_, out}, spirv,
 
1313
  {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
1314
  );
1315
  } else {
1316
  s_algo = komputeManager()->getAlgorithm(name);
1317
+ s_algo->setTensors({inA, inB, inC_, out});
1318
  s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
1319
  s_algo->setPushConstants<PushConstants>({pushConsts});
1320
  s_algo->updateDescriptors(s_kompute_context->pool.get());
 
1393
  }
1394
 
1395
  static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
1396
+ int64_t n = ggml_nelements(op);
1397
  switch (op->op) {
1398
  case GGML_OP_UNARY:
1399
+ if (n % 4 != 0) return false;
1400
  switch (ggml_get_unary_op(op)) {
 
1401
  case GGML_UNARY_OP_GELU:
1402
+ if (n % 8 != 0) return false;
1403
+ // fall through
1404
+ case GGML_UNARY_OP_RELU:
1405
  case GGML_UNARY_OP_SILU:
1406
  return ggml_is_contiguous(op->src[0]);
1407
  default:
 
1459
 
1460
  switch (op->src[0]->type) {
1461
  case GGML_TYPE_F32:
 
1462
  return op->ne[3] == 1;
1463
+ case GGML_TYPE_Q6_K:
1464
  case GGML_TYPE_F16:
1465
  case GGML_TYPE_Q8_0:
1466
  case GGML_TYPE_Q4_0:
 
1561
  const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
1562
  uint32_t off_src0 = 0;
1563
  uint32_t off_src1 = 0;
1564
+ uint32_t off_src2 = 0;
1565
  uint32_t off_dst = 0;
1566
  const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
1567
  const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
1568
+ const std::shared_ptr<kp::Tensor>& id_src2 = src2 ? ggml_vk_get_tensor(src2, &off_src2) : nullTensor;
1569
  const std::shared_ptr<kp::Tensor>& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor;
1570
 
1571
  switch (dst->op) {
 
1641
  #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
1642
  GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
1643
 
1644
+ const int64_t nrows_x = ggml_nrows(src0);
1645
+ const int64_t nrows_y = src0->ne[1];
1646
+
1647
+ const uint32_t n_head = nrows_x/nrows_y;
1648
+ const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
1649
 
1650
+ const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
1651
+ const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
1652
+
1653
+ ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale, max_bias, m0, m1, n_head_log2);
1654
  } break;
1655
  case GGML_OP_DIAG_MASK_INF:
1656
  {
 
1702
  case GGML_TYPE_F16:
1703
  ggml_vk_mul_mat_f16(
1704
  seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1705
+ ne00, ne01, ne02, nb00, nb01, nb02, nb03,
1706
+ ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
1707
  ne0, ne1, r2, r3
1708
  );
1709
  break;
1710
  case GGML_TYPE_Q8_0:
1711
  ggml_vk_mul_mat_q8_0(
1712
  seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1713
+ ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1714
+ nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1715
  );
1716
  break;
1717
  case GGML_TYPE_Q4_0:
1718
  ggml_vk_mul_mat_q4_0(
1719
  seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1720
+ ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1721
+ nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1722
  );
1723
  break;
1724
  case GGML_TYPE_Q4_1:
1725
  ggml_vk_mul_mat_q4_1(
1726
  seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1727
+ ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1728
+ nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1729
  );
1730
  break;
1731
  case GGML_TYPE_Q4_K:
1732
  ggml_vk_mul_mat_q4_k(
1733
  seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1734
+ ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1735
+ nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1736
  );
1737
  break;
1738
  case GGML_TYPE_Q6_K:
1739
  ggml_vk_mul_mat_q6_k(
1740
  seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
1741
+ ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
1742
+ nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
1743
  );
1744
  break;
1745
  default: {
 
1768
  } break;
1769
  case GGML_OP_ROPE:
1770
  {
1771
  GGML_ASSERT(ne10 == ne02);
1772
  GGML_ASSERT(src0t == dstt);
1773
  // const int n_past = ((int32_t *) dst->op_params)[0];
 
1776
  // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
1777
  const int n_ctx_orig = ((int32_t *) dst->op_params)[4];
1778
 
1779
+ const bool has_freq_factors = dst->src[2] != nullptr;
1780
+
1781
  float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
1782
  memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float));
1783
  memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
 
1786
  memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
1787
  memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));
1788
  ggml_vk_rope(
1789
+ seq, id_src0, id_src1, id_src2, id_dst, off_src0, off_src1, off_src2, off_dst, src0t, n_dims, mode, n_ctx_orig,
1790
+ freq_base, freq_scale, has_freq_factors, ext_factor, attn_factor, beta_fast, beta_slow,
1791
  ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
1792
  );
1793
  } break;
ggml/src/ggml-kompute/kompute-shaders/common.comp CHANGED
@@ -3,6 +3,7 @@
3
  #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
4
  #extension GL_EXT_shader_explicit_arithmetic_types_int8: require
5
  #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
 
6
  #extension GL_EXT_control_flow_attributes: enable
7
  #extension GL_KHR_shader_subgroup_arithmetic : require
8
  #extension GL_EXT_debug_printf : enable
 
3
  #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
4
  #extension GL_EXT_shader_explicit_arithmetic_types_int8: require
5
  #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
6
+ #extension GL_EXT_shader_explicit_arithmetic_types_int64: require
7
  #extension GL_EXT_control_flow_attributes: enable
8
  #extension GL_KHR_shader_subgroup_arithmetic : require
9
  #extension GL_EXT_debug_printf : enable
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp CHANGED
@@ -20,12 +20,14 @@ layout (push_constant) uniform parameter {
20
  uint nb00;
21
  uint nb01;
22
  uint nb02;
 
23
  int ne10;
24
  int ne11;
25
  int ne12;
26
  uint nb10;
27
  uint nb11;
28
  uint nb12;
 
29
  int ne0;
30
  int ne1;
31
  uint r2;
@@ -42,7 +44,7 @@ void main() {
42
  const uint i12 = im%pcs.ne12;
43
  const uint i13 = im/pcs.ne12;
44
 
45
- const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb02*pcs.ne02;
46
 
47
  const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
48
 
@@ -52,7 +54,7 @@ void main() {
52
  break;
53
  }
54
 
55
- const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // Based from inB
56
 
57
  float sumf = 0;
58
  for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
 
20
  uint nb00;
21
  uint nb01;
22
  uint nb02;
23
+ uint nb03;
24
  int ne10;
25
  int ne11;
26
  int ne12;
27
  uint nb10;
28
  uint nb11;
29
  uint nb12;
30
+ uint nb13;
31
  int ne0;
32
  int ne1;
33
  uint r2;
 
44
  const uint i12 = im%pcs.ne12;
45
  const uint i13 = im/pcs.ne12;
46
 
47
+ const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03;
48
 
49
  const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
50
 
 
54
  break;
55
  }
56
 
57
+ const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
58
 
59
  float sumf = 0;
60
  for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp CHANGED
@@ -24,8 +24,14 @@ layout (push_constant) uniform parameter {
24
  int ne01;
25
  int ne02;
26
  int ne12;
27
- int r2;
28
- int r3;
 
 
 
 
 
 
29
  } pcs;
30
 
31
  void main() {
@@ -50,10 +56,11 @@ void main() {
50
  const uint i12 = im%pcs.ne12;
51
  const uint i13 = im/pcs.ne12;
52
 
53
- const uint offset0 = (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
 
54
 
55
- const uint xblk = ib_row + offset0 + pcs.inAOff;
56
- const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff;
57
 
58
  float yl[16];
59
  float yh[16];
@@ -74,7 +81,7 @@ void main() {
74
  }
75
 
76
  for (int row = 0; row < N_DST; row++) {
77
- uint row_idx = row * nb;
78
 
79
  uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
80
  uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
 
24
  int ne01;
25
  int ne02;
26
  int ne12;
27
+ uint nb01;
28
+ uint nb02;
29
+ uint nb03;
30
+ uint nb11;
31
+ uint nb12;
32
+ uint nb13;
33
+ uint r2;
34
+ uint r3;
35
  } pcs;
36
 
37
  void main() {
 
56
  const uint i12 = im%pcs.ne12;
57
  const uint i13 = im/pcs.ne12;
58
 
59
+ const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
60
+ const uint offset1 = r1*pcs.nb11 + (i12 )*pcs.nb12 + (i13 )*pcs.nb13;
61
 
62
+ const uint xblk = offset0 + pcs.inAOff;
63
+ const uint y = (offset1 / 4) + pcs.inBOff;
64
 
65
  float yl[16];
66
  float yh[16];
 
81
  }
82
 
83
  for (int row = 0; row < N_DST; row++) {
84
+ uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK);
85
 
86
  uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
87
  uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp CHANGED
@@ -21,7 +21,16 @@ layout (push_constant) uniform parameter {
21
  int ne0;
22
  int ne1;
23
  int ne01;
24
- int gqa;
 
 
 
 
 
 
 
 
 
25
  } pcs;
26
 
27
  void main() {
@@ -34,12 +43,15 @@ void main() {
34
 
35
  const uint r0 = gl_WorkGroupID.x;
36
  const uint r1 = gl_WorkGroupID.y;
37
- const uint r2 = gl_WorkGroupID.z;
38
 
39
  const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
40
- const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
41
- const uint x = row * nb + offset0; // Based from inA without base offset
42
- const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
 
 
 
43
 
44
  float sumf = 0;
45
 
@@ -89,6 +101,6 @@ void main() {
89
 
90
  const float tot = subgroupAdd(sumf);
91
  if (subgroupElect()) {
92
- out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
93
  }
94
  }
 
21
  int ne0;
22
  int ne1;
23
  int ne01;
24
+ int ne02;
25
+ int ne12;
26
+ uint nb01;
27
+ uint nb02;
28
+ uint nb03;
29
+ uint nb11;
30
+ uint nb12;
31
+ uint nb13;
32
+ uint r2;
33
+ uint r3;
34
  } pcs;
35
 
36
  void main() {
 
43
 
44
  const uint r0 = gl_WorkGroupID.x;
45
  const uint r1 = gl_WorkGroupID.y;
46
+ const uint im = gl_WorkGroupID.z;
47
 
48
  const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
49
+
50
+ const uint i12 = im%pcs.ne12;
51
+ const uint i13 = im/pcs.ne12;
52
+
53
+ const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
54
+ const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
55
 
56
  float sumf = 0;
57
 
 
101
 
102
  const float tot = subgroupAdd(sumf);
103
  if (subgroupElect()) {
104
+ out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
105
  }
106
  }
ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp CHANGED
@@ -14,10 +14,15 @@ void main() {
14
  const uint i12 = im%pcs.ne12;
15
  const uint i13 = im/pcs.ne12;
16
 
17
- const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
 
 
 
 
 
 
18
 
19
- const uint x = offset0; // Based from inA without base offset
20
- const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
21
 
22
  float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
23
 
@@ -32,8 +37,7 @@ void main() {
32
 
33
  for (uint ib = ix; ib < nb; ib += 16) {
34
  for (int row = 0; row < N_ROWS; row++) {
35
- const uint block_index = x + ib + row * nb;
36
- sumf[row] += block_q_n_dot_y(block_index, yb, il);
37
  }
38
 
39
  yb += BLOCKS_IN_QUANT * 16;
 
14
  const uint i12 = im%pcs.ne12;
15
  const uint i13 = im/pcs.ne12;
16
 
17
+ // pointers to src0 rows
18
+ uint ax[N_ROWS];
19
+ for (int row = 0; row < N_ROWS; ++row) {
20
+ const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
21
+
22
+ ax[row] = offset0 + pcs.inAOff;
23
+ }
24
 
25
+ const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;
 
26
 
27
  float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
28
 
 
37
 
38
  for (uint ib = ix; ib < nb; ib += 16) {
39
  for (int row = 0; row < N_ROWS; row++) {
40
+ sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
 
41
  }
42
 
43
  yb += BLOCKS_IN_QUANT * 16;
ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp CHANGED
@@ -1,5 +1,5 @@
1
  layout(local_size_x_id = 0) in;
2
- layout(local_size_y = 1) in;
3
  layout(local_size_z = 1) in;
4
 
5
  layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
@@ -17,6 +17,12 @@ layout (push_constant) uniform parameter {
17
  int ne12;
18
  int ne0;
19
  int ne1;
 
 
 
 
 
 
20
  uint r2;
21
  uint r3;
22
  } pcs;
 
1
  layout(local_size_x_id = 0) in;
2
+ layout(local_size_y = 8) in;
3
  layout(local_size_z = 1) in;
4
 
5
  layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
 
17
  int ne12;
18
  int ne0;
19
  int ne1;
20
+ uint nb01;
21
+ uint nb02;
22
+ uint nb03;
23
+ uint nb11;
24
+ uint nb12;
25
+ uint nb13;
26
  uint r2;
27
  uint r3;
28
  } pcs;
ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp ADDED
@@ -0,0 +1,52 @@
1
+ #version 450
2
+
3
+ #include "rope_common.comp"
4
+
5
+ layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
6
+ layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
7
+ layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
8
+ layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
9
+
10
+ void main() {
11
+ const uint i3 = gl_WorkGroupID.z;
12
+ const uint i2 = gl_WorkGroupID.y;
13
+ const uint i1 = gl_WorkGroupID.x;
14
+
15
+ float corr_dims[2];
16
+ rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
17
+
18
+ const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
19
+
20
+ float theta_base = float(inB[pcs.inBOff + i2]);
21
+ float inv_ndims = -1.f/pcs.n_dims;
22
+
23
+ float cos_theta;
24
+ float sin_theta;
25
+
26
+ for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
27
+ if (i0 < pcs.n_dims) {
28
+ uint ic = i0/2;
29
+
30
+ float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
31
+
32
+ const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
33
+
34
+ rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
35
+
36
+ const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in
37
+ const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 2) + pcs.outOff; // Based from out_
38
+
39
+ const float x0 = float(inA[src]);
40
+ const float x1 = float(inA[src+pcs.n_dims/2]);
41
+
42
+ out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
43
+ out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
44
+ } else {
45
+ const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
46
+ const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
47
+
48
+ out_[dst_data] = inA[src];
49
+ out_[dst_data+1] = inA[src+1];
50
+ }
51
+ }
52
+ }
ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp ADDED
@@ -0,0 +1,52 @@
1
+ #version 450
2
+
3
+ #include "rope_common.comp"
4
+
5
+ layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
6
+ layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
7
+ layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
8
+ layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
9
+
10
+ void main() {
11
+ const uint i3 = gl_WorkGroupID.z;
12
+ const uint i2 = gl_WorkGroupID.y;
13
+ const uint i1 = gl_WorkGroupID.x;
14
+
15
+ float corr_dims[2];
16
+ rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
17
+
18
+ const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
19
+
20
+ float theta_base = float(inB[pcs.inBOff + i2]);
21
+ float inv_ndims = -1.f/pcs.n_dims;
22
+
23
+ float cos_theta;
24
+ float sin_theta;
25
+
26
+ for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
27
+ if (i0 < pcs.n_dims) {
28
+ uint ic = i0/2;
29
+
30
+ float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
31
+
32
+ const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
33
+
34
+ rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
35
+
36
+ const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in
37
+ const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 4) + pcs.outOff; // Based from out_
38
+
39
+ const float x0 = inA[src];
40
+ const float x1 = inA[src+pcs.n_dims/2];
41
+
42
+ out_[dst_data] = x0*cos_theta - x1*sin_theta;
43
+ out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
44
+ } else {
45
+ const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
46
+ const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
47
+
48
+ out_[dst_data] = inA[src];
49
+ out_[dst_data+1] = inA[src+1];
50
+ }
51
+ }
52
+ }
ggml/src/ggml-kompute/kompute-shaders/{op_rope_f16.comp → op_rope_norm_f16.comp} RENAMED
@@ -4,30 +4,34 @@
4
 
5
  layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
6
  layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
7
- layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; };
 
8
 
9
  void main() {
10
  const uint i3 = gl_WorkGroupID.z;
11
  const uint i2 = gl_WorkGroupID.y;
12
  const uint i1 = gl_WorkGroupID.x;
13
 
14
- const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;
15
-
16
  float corr_dims[2];
17
  rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
18
 
19
  const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
20
 
21
- const int p = inB[pcs.inBOff + i2];
 
 
 
 
22
 
23
- float theta = float(p);
 
 
24
 
25
- if (!is_neox) {
26
- for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
27
- float cos_theta, sin_theta;
28
- rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
29
 
30
- theta *= theta_scale;
 
 
31
 
32
  const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
33
  const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
@@ -37,37 +41,12 @@ void main() {
37
 
38
  out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
39
  out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
40
- }
41
- } else {
42
- const float inv_ndims = -1.f/pcs.n_dims;
43
- for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
44
- const uint cur_rot = ic;
45
-
46
- float cos_theta, sin_theta;
47
- rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
48
-
49
- theta *= theta_scale;
50
-
51
- const uint i0 = ic/2;
52
-
53
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
54
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
55
-
56
- const float x0 = float(inA[src]);
57
- const float x1 = float(inA[src+pcs.n_dims/2]);
58
-
59
- out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
60
- out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
61
- }
62
-
63
- for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
64
- const uint i0 = ic;
65
-
66
  const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
67
  const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
68
 
69
- out_[dst_data + 0] = inA[src + 0];
70
- out_[dst_data + 1] = inA[src + 1];
71
  }
72
  }
73
  }
 
4
 
5
  layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
6
  layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
7
+ layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
8
+ layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
9
 
10
  void main() {
11
  const uint i3 = gl_WorkGroupID.z;
12
  const uint i2 = gl_WorkGroupID.y;
13
  const uint i1 = gl_WorkGroupID.x;
14
 
 
 
15
  float corr_dims[2];
16
  rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
17
 
18
  const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
19
 
20
+ float theta_base = float(inB[pcs.inBOff + i2]);
21
+ float inv_ndims = -1.f/pcs.n_dims;
22
+
23
+ float cos_theta;
24
+ float sin_theta;
25
 
26
+ for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
27
+ if (i0 < pcs.n_dims) {
28
+ uint ic = i0/2;
29
 
30
+ float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
 
 
 
31
 
32
+ const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
33
+
34
+ rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
35
 
36
  const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
37
  const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
 
41
 
42
  out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
43
  out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
44
+ } else {
45
  const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
46
  const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
47
 
48
+ out_[dst_data] = inA[src];
49
+ out_[dst_data+1] = inA[src+1];
50
  }
51
  }
52
  }
ggml/src/ggml-kompute/kompute-shaders/{op_rope_f32.comp → op_rope_norm_f32.comp} RENAMED
@@ -4,30 +4,34 @@
4
 
5
  layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
6
  layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
7
- layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
 
8
 
9
  void main() {
10
  const uint i3 = gl_WorkGroupID.z;
11
  const uint i2 = gl_WorkGroupID.y;
12
  const uint i1 = gl_WorkGroupID.x;
13
 
14
- const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;
15
-
16
  float corr_dims[2];
17
  rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
18
 
19
  const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
20
 
21
- const int p = inB[pcs.inBOff + i2];
 
 
 
 
22
 
23
- float theta = float(p);
 
 
24
 
25
- if (!is_neox) {
26
- for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
27
- float cos_theta, sin_theta;
28
- rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
29
 
30
- theta *= theta_scale;
 
 
31
 
32
  const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
33
  const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
@@ -37,37 +41,12 @@ void main() {
37
 
38
  out_[dst_data] = x0*cos_theta - x1*sin_theta;
39
  out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
40
- }
41
- } else {
42
- const float inv_ndims = -1.f/pcs.n_dims;
43
- for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
44
- const uint cur_rot = ic;
45
-
46
- float cos_theta, sin_theta;
47
- rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
48
-
49
- theta *= theta_scale;
50
-
51
- const uint i0 = ic/2;
52
-
53
  const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
54
  const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
55
 
56
- const float x0 = inA[src];
57
- const float x1 = inA[src+pcs.n_dims/2];
58
-
59
- out_[dst_data] = x0*cos_theta - x1*sin_theta;
60
- out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
61
- }
62
-
63
- for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
64
- const uint i0 = ic;
65
-
66
- const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
67
- const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
68
-
69
- out_[dst_data + 0] = inA[src + 0];
70
- out_[dst_data + 1] = inA[src + 1];
71
  }
72
  }
73
  }
 
4
 
5
  layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
6
  layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
7
+ layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; };
8
+ layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
9
 
10
  void main() {
11
  const uint i3 = gl_WorkGroupID.z;
12
  const uint i2 = gl_WorkGroupID.y;
13
  const uint i1 = gl_WorkGroupID.x;
14
 
 
 
15
  float corr_dims[2];
16
  rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
17
 
18
  const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
19
 
20
+ float theta_base = float(inB[pcs.inBOff + i2]);
21
+ float inv_ndims = -1.f/pcs.n_dims;
22
+
23
+ float cos_theta;
24
+ float sin_theta;
25
 
26
+ for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
27
+ if (i0 < pcs.n_dims) {
28
+ uint ic = i0/2;
29
 
30
+ float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
 
 
 
31
 
32
+ const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
33
+
34
+ rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
35
 
36
  const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
37
  const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
 
41
 
42
  out_[dst_data] = x0*cos_theta - x1*sin_theta;
43
  out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
44
+ } else {
45
  const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
46
  const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
47
 
48
+ out_[dst_data] = inA[src];
49
+ out_[dst_data+1] = inA[src+1];
50
  }
51
  }
52
  }
ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp CHANGED
@@ -18,6 +18,10 @@ layout(push_constant) uniform PushConstants {
18
  int ne01;
19
  int ne02;
20
  float scale;
 
 
 
 
21
  int mask;
22
  } pcs;
23
 
@@ -34,17 +38,29 @@ void main() {
34
  const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
35
  const uint pdst = extra_off + pcs.outOff; // Based from out_
36
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  // parallel max
38
  float localMax = uintBitsToFloat(0xFF800000);
39
  for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
40
- localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f));
41
  }
42
  float max_ = subgroupMax(localMax);
43
 
44
  // parallel sum
45
  float localSum = 0.0f;
46
  for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
47
- const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f) - max_);
48
  localSum += exp_psrc0;
49
  out_[pdst + i00] = exp_psrc0;
50
  }
 
18
  int ne01;
19
  int ne02;
20
  float scale;
21
+ float max_bias;
22
+ float m0;
23
+ float m1;
24
+ uint n_head_log2;
25
  int mask;
26
  } pcs;
27
 
 
38
  const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
39
  const uint pdst = extra_off + pcs.outOff; // Based from out_
40
 
41
+ float slope = 1.0f;
42
+
43
+ // ALiBi
44
+ if (pcs.max_bias > 0.0f) {
45
+ int64_t h = i02;
46
+
47
+ float base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1;
48
+ int64_t exp = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1;
49
+
50
+ slope = pow(base, float(exp));
51
+ }
52
+
53
  // parallel max
54
  float localMax = uintBitsToFloat(0xFF800000);
55
  for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
56
+ localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f));
57
  }
58
  float max_ = subgroupMax(localMax);
59
 
60
  // parallel sum
61
  float localSum = 0.0f;
62
  for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
63
+ const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_);
64
  localSum += exp_psrc0;
65
  out_[pdst + i00] = exp_psrc0;
66
  }
ggml/src/ggml-kompute/kompute-shaders/rope_common.comp CHANGED
@@ -8,12 +8,14 @@ layout(local_size_x = 1) in;
8
  layout (push_constant) uniform parameter {
9
  uint inAOff;
10
  uint inBOff;
 
11
  uint outOff;
12
  int n_dims;
13
  int mode;
14
  int n_ctx_orig;
15
  float freq_base;
16
  float freq_scale;
 
17
  float ext_factor;
18
  float attn_factor;
19
  float beta_fast;
 
8
  layout (push_constant) uniform parameter {
9
  uint inAOff;
10
  uint inBOff;
11
+ uint inCOff;
12
  uint outOff;
13
  int n_dims;
14
  int mode;
15
  int n_ctx_orig;
16
  float freq_base;
17
  float freq_scale;
18
+ bool has_freq_factors;
19
  float ext_factor;
20
  float attn_factor;
21
  float beta_fast;