kompute : improve backend to pass test_backend_ops (llama/10542)
* kompute: op_unary: reject unsupported parameters

Signed-off-by: Sergio Lopez <[email protected]>

* kompute: softmax: implement ALiBi support

Signed-off-by: Sergio Lopez <[email protected]>

* kompute: rope: implement neox and phi3 support

Signed-off-by: Sergio Lopez <[email protected]>

* kompute: op_mul_mat_q4_k permuted support

Signed-off-by: Sergio Lopez <[email protected]>

* kompute: op_mul_mat_[q4_0|q4_1|q8_0] permuted support

Signed-off-by: Sergio Lopez <[email protected]>

* kompute: op_mul_mat_f16 permuted support

Signed-off-by: Sergio Lopez <[email protected]>

* kompute: op_mul_mat_q6_k permuted support

Signed-off-by: Sergio Lopez <[email protected]>
---------
Signed-off-by: Sergio Lopez <[email protected]>
- ggml/src/ggml-kompute/CMakeLists.txt +8 -4
- ggml/src/ggml-kompute/ggml-kompute.cpp +115 -61
- ggml/src/ggml-kompute/kompute-shaders/common.comp +1 -0
- ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +4 -2
- ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +13 -6
- ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +18 -6
- ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +9 -5
- ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +7 -1
- ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +52 -0
- ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +52 -0
- ggml/src/ggml-kompute/kompute-shaders/{op_rope_f16.comp → op_rope_norm_f16.comp} +17 -38
- ggml/src/ggml-kompute/kompute-shaders/{op_rope_f32.comp → op_rope_norm_f32.comp} +17 -38
- ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +18 -2
- ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +2 -0
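A note on the recurring pattern in the matmul changes below: instead of assuming contiguous rows, every kernel now receives the source byte strides (nb01/nb02/nb03 for src0, nb11/nb12/nb13 for src1) together with the broadcast ratios r2 = ne12/ne02 and r3 = ne13/ne03, and derives row addresses from them. That is what makes permuted (non-contiguous) tensors work. A minimal host-side sketch of that offset math (hypothetical helper, not code from this commit):

    #include <cstdint>

    // Hypothetical helper mirroring the addressing the shaders below use: a row
    // of src0 is located purely through byte strides, so a permuted tensor is
    // handled exactly like a contiguous one.
    static uint64_t src0_row_offset(uint32_t r0, uint32_t i12, uint32_t i13,
                                    uint64_t nb01, uint64_t nb02, uint64_t nb03,
                                    uint32_t r2, uint32_t r3) {
        // r2 = ne12/ne02 and r3 = ne13/ne03 implement batch broadcasting.
        return r0*nb01 + (i12/r2)*nb02 + (i13/r3)*nb03;
    }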
ggml/src/ggml-kompute/CMakeLists.txt
CHANGED
|
@@ -105,8 +105,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
         kompute-shaders/op_getrows_q4_0.comp
         kompute-shaders/op_getrows_q4_1.comp
         kompute-shaders/op_getrows_q6_k.comp
-        kompute-shaders/op_rope_f16.comp
-        kompute-shaders/op_rope_f32.comp
+        kompute-shaders/op_rope_norm_f16.comp
+        kompute-shaders/op_rope_norm_f32.comp
+        kompute-shaders/op_rope_neox_f16.comp
+        kompute-shaders/op_rope_neox_f32.comp
         kompute-shaders/op_cpy_f16_f16.comp
         kompute-shaders/op_cpy_f16_f32.comp
         kompute-shaders/op_cpy_f32_f16.comp
@@ -139,8 +141,10 @@ if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
         shaderop_getrows_q4_0.h
         shaderop_getrows_q4_1.h
         shaderop_getrows_q6_k.h
-        shaderop_rope_f16.h
-        shaderop_rope_f32.h
+        shaderop_rope_norm_f16.h
+        shaderop_rope_norm_f32.h
+        shaderop_rope_neox_f16.h
+        shaderop_rope_neox_f32.h
         shaderop_cpy_f16_f16.h
         shaderop_cpy_f16_f32.h
         shaderop_cpy_f32_f16.h
ggml/src/ggml-kompute/ggml-kompute.cpp
CHANGED
|
@@ -28,8 +28,10 @@
 #include "shaderop_getrows_q4_0.h"
 #include "shaderop_getrows_q4_1.h"
 #include "shaderop_getrows_q6_k.h"
-#include "shaderop_rope_f16.h"
-#include "shaderop_rope_f32.h"
+#include "shaderop_rope_norm_f16.h"
+#include "shaderop_rope_norm_f32.h"
+#include "shaderop_rope_neox_f16.h"
+#include "shaderop_rope_neox_f32.h"
 #include "shaderop_cpy_f16_f16.h"
 #include "shaderop_cpy_f16_f32.h"
 #include "shaderop_cpy_f32_f16.h"
@@ -345,7 +347,7 @@ void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t
     std::vector<vk::DescriptorPoolSize> descriptorPoolSizes = {
         vk::DescriptorPoolSize(
             vk::DescriptorType::eStorageBuffer,
-            3 * size // Descriptor count is number of possible tensors to pass into an algorithm
+            4 * size // Descriptor count is number of possible tensors to pass into an algorithm
         )
     };
@@ -788,7 +790,8 @@ static void ggml_vk_soft_max(
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
     int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03,
-    float scale
+    float scale, float max_bias, float m0, float m1,
+    uint32_t n_head_log2
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv,
                                              kp::shader_data::op_softmax_comp_spv_len);
@@ -796,12 +799,14 @@ static void ggml_vk_soft_max(
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
         int32_t ne00, ne01, ne02;
-        float scale;
+        float scale, max_bias, m0, m1;
+        uint32_t n_head_log2;
         int32_t mask;
     } pushConsts {
         safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
-        scale,
+        scale, max_bias, m0, m1,
+        n_head_log2,
         bool(inB)
     };
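For context, the host-side PushConstants struct has to mirror the push_constant block in op_softmax.comp member for member and in the same order; the new ALiBi fields are appended identically on both sides. A reduced sketch of the C++ side (members taken from the diff; every member is a 4-byte scalar, so the layouts line up):

    #include <cstdint>

    // Host-side mirror of the shader's push_constant block after this change.
    // Order and width of every member must match the GLSL declaration exactly.
    struct SoftmaxPushConstants {
        uint32_t inAOff, inBOff, outOff;
        int32_t  ne00, ne01, ne02;
        float    scale, max_bias, m0, m1;
        uint32_t n_head_log2;
        int32_t  mask;
    };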
@@ -911,9 +916,9 @@ static void ggml_vk_mul_mat_f16(
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
     int32_t ne00, int32_t ne01, int32_t ne02,
-    uint32_t nb00, uint32_t nb01, uint32_t nb02,
+    uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
     int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
-    uint32_t nb10, uint32_t nb11, uint32_t nb12,
+    uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13,
     int32_t ne0, int32_t ne1,
     uint32_t r2, uint32_t r3
 ) {
@@ -923,17 +928,17 @@ static void ggml_vk_mul_mat_f16(
     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
         int32_t ne00, ne01, ne02;
-        uint32_t nb00, nb01, nb02;
+        uint32_t nb00, nb01, nb02, nb03;
         int32_t ne10, ne11, ne12;
-        uint32_t nb10, nb11, nb12;
+        uint32_t nb10, nb11, nb12, nb13;
         int32_t ne0, ne1;
         uint32_t r2, r3;
     } pushConsts {
         safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
-        nb00, nb01, nb02,
+        nb00, nb01, nb02, nb03,
         ne10, ne11, ne12,
-        nb10, nb11, nb12,
+        nb10, nb11, nb12, nb13,
         ne0, ne1,
         r2, r3
     };
@@ -1013,6 +1018,8 @@ static void ggml_vk_mul_mat_impl(
     int32_t ne00, int32_t ne01, int32_t ne02,
     int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
     int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
     uint32_t r2, uint32_t r3
 ) {
     struct PushConstants {
@@ -1020,19 +1027,23 @@ static void ggml_vk_mul_mat_impl(
         int32_t ne00, ne01, ne02;
         int32_t ne10, ne12;
         int32_t ne0, ne1;
+        uint32_t nb01, nb02, nb03;
+        uint32_t nb11, nb12, nb13;
         uint32_t r2, r3;
     } pushConsts {
         safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4),
         ne00, ne01, ne02,
         ne10, ne12,
         ne0, ne1,
+        nb01, nb02, nb03,
+        nb11, nb12, nb13,
         r2, r3
     };

     auto name = std::string(__func__) + "_" + suffix;
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
+        const uint32_t local_x = (ggml_vk_current_device().subgroupSize * 2) / 8;
         s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
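The new /8 in local_x pairs with the layout(local_size_y = 8) added to op_mul_mv_q_n_pre.comp further down: the thread count per workgroup stays at subgroupSize * 2, but each workgroup now covers 8 output rows, which is why the x dispatch dimension is (ne01 + 7)/8. A sketch of the arithmetic (assuming a subgroup size of 32, a common value):

    #include <cstdint>

    // Workgroup and dispatch arithmetic for the quantized mat-vec path.
    // With subgroupSize == 32: local_x = (32*2)/8 = 8, and the shader's fixed
    // local_size_y = 8 brings the workgroup back to 64 threads covering 8 rows.
    struct Dispatch { uint32_t x, y, z; };

    static Dispatch mul_mv_dispatch(int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne13) {
        return { (uint32_t)(ne01 + 7)/8, (uint32_t)ne11, (uint32_t)(ne12*ne13) };
    }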
@@ -1074,19 +1085,26 @@ static void ggml_vk_mul_mat_q4_k(
     const std::shared_ptr<kp::Tensor>& inB,
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-    int32_t ne00, int32_t ne01, int32_t ne02,
-    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, int32_t ne0, int32_t ne1,
-    int32_t r2, int32_t r3
+    int32_t ne00, int32_t ne01, int32_t ne02,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
+    uint32_t r2, uint32_t r3
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv,
                                              kp::shader_data::op_mul_mat_q4_k_comp_spv_len);

     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3;
+        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
+        uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
+        uint32_t r2, r3;
     } pushConsts {
-        0, 0, 0,
-        ne00, ne10, ne0, ne1, ne01, ne02, ne12, r2, r3
+        inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
+        ne00, ne10, ne0, ne1, ne01, ne02, ne12,
+        nb01, nb02, nb03, nb11, nb12, nb13,
+        r2, r3
     };

     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
@@ -1108,28 +1126,37 @@ static void ggml_vk_mul_mat_q6_k(
     const std::shared_ptr<kp::Tensor>& inB,
     const std::shared_ptr<kp::Tensor>& out,
     uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
-    int32_t ne00, int32_t ne10, int32_t ne0, int32_t ne1,
-    int32_t ne01, int32_t ne11, int32_t ne12, int32_t ne02
+    int32_t ne00, int32_t ne01, int32_t ne02,
+    int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13,
+    int32_t ne0, int32_t ne1,
+    uint32_t nb01, uint32_t nb02, uint32_t nb03,
+    uint32_t nb11, uint32_t nb12, uint32_t nb13,
+    uint32_t r2, uint32_t r3
 ) {
     const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv,
                                              kp::shader_data::op_mul_mat_q6_k_comp_spv_len);

     struct PushConstants {
         uint32_t inAOff, inBOff, outOff;
-        int32_t ne00, ne10, ne0, ne1, ne01, gqa;
+        int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12;
+        uint32_t nb01, nb02, nb03, nb11, nb12, nb13;
+        uint32_t r2, r3;
     } pushConsts {
         inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4),
-        ne00, ne10, ne0, ne1, ne01, ne12/ne02
+        ne00, ne10, ne0, ne1, ne01, ne02, ne12,
+        nb01, nb02, nb03, nb11, nb12, nb13,
+        r2, r3
     };

     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(__func__)) {
-        const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2;
-        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)}, {local_x}, {pushConsts});
+        const uint32_t local_x = 2;
+        const uint32_t local_y = ggml_vk_current_device().subgroupSize;
+        s_algo = komputeManager()->algorithm<uint32_t, PushConstants>(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts});
     } else {
         s_algo = komputeManager()->getAlgorithm(__func__);
         s_algo->setTensors({inA, inB, out});
-        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)});
+        s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
     }
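Because a quantized src0 is indexed as an array of blocks rather than bytes, the shaders divide the byte stride by SIZE_OF_BLOCK before using it, as in the pcs.nb01/SIZE_OF_BLOCK expressions that appear below. An equivalent scalar sketch (size_of_block would be, e.g., sizeof(block_q6_K) on the host):

    #include <cstdint>

    // Block index of row `row` in batch (i12, i13) of a quantized src0,
    // equivalent to the nb/SIZE_OF_BLOCK expressions in the q4_k/q6_k shaders.
    static uint32_t q_block_index(uint32_t row, uint32_t i12, uint32_t i13,
                                  uint32_t nb01, uint32_t nb02, uint32_t nb03,
                                  uint32_t r2, uint32_t r3, uint32_t size_of_block) {
        return row*(nb01/size_of_block)
             + (i12/r2)*(nb02/size_of_block)
             + (i13/r3)*(nb03/size_of_block);
    }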
@@ -1217,10 +1244,11 @@ static void ggml_vk_rope(
     kp::Sequence& seq,
     const std::shared_ptr<kp::Tensor>& inA,
     const std::shared_ptr<kp::Tensor>& inB,
+    const std::shared_ptr<kp::Tensor>& inC,
     const std::shared_ptr<kp::Tensor>& out,
-    uint32_t inAOff, uint32_t inBOff, uint32_t outOff,
+    uint32_t inAOff, uint32_t inBOff, uint32_t inCOff, uint32_t outOff,
     ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig,
-    float freq_base, float freq_scale, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
+    float freq_base, float freq_scale, bool has_freq_factors, float ext_factor, float attn_factor, float beta_fast, float beta_slow,
     int32_t ne01, int32_t ne02, int32_t ne03,
     uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03,
     int32_t ne0,
@@ -1228,11 +1256,17 @@ static void ggml_vk_rope(
 ) {
     GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32);

-    static const auto spirv_f16 = getSpirvShader(
-        kp::shader_data::op_rope_f16_comp_spv, kp::shader_data::op_rope_f16_comp_spv_len
-    );
-    static const auto spirv_f32 = getSpirvShader(
-        kp::shader_data::op_rope_f32_comp_spv, kp::shader_data::op_rope_f32_comp_spv_len
-    );
+    static const auto spirv_norm_f16 = getSpirvShader(
+        kp::shader_data::op_rope_norm_f16_comp_spv, kp::shader_data::op_rope_norm_f16_comp_spv_len
+    );
+    static const auto spirv_norm_f32 = getSpirvShader(
+        kp::shader_data::op_rope_norm_f32_comp_spv, kp::shader_data::op_rope_norm_f32_comp_spv_len
+    );
+    static const auto spirv_neox_f16 = getSpirvShader(
+        kp::shader_data::op_rope_neox_f16_comp_spv, kp::shader_data::op_rope_neox_f16_comp_spv_len
+    );
+    static const auto spirv_neox_f32 = getSpirvShader(
+        kp::shader_data::op_rope_neox_f32_comp_spv, kp::shader_data::op_rope_neox_f32_comp_spv_len
+    );

     int type_size = src0t == GGML_TYPE_F16 ? 2 : 4;
@@ -1247,32 +1281,40 @@ static void ggml_vk_rope(
     GGML_ASSERT(nb0 % type_size == 0);

     struct PushConstants {
-        uint32_t inAOff, inBOff, outOff;
+        uint32_t inAOff, inBOff, inCOff, outOff;
         int32_t n_dims, mode, n_ctx_orig;
-        float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+        float freq_base, freq_scale;
+        bool has_freq_factors;
+        float ext_factor, attn_factor, beta_fast, beta_slow;
         uint32_t nb00, nb01, nb02, nb03;
         int32_t ne0;
         uint32_t nb0, nb1, nb2, nb3;
     } pushConsts {
-        safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(outOff, type_size),
+        safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(inCOff, type_size), safe_divide(outOff, type_size),
         n_dims, mode, n_ctx_orig,
-        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
+        freq_base, freq_scale,
+        has_freq_factors,
+        ext_factor, attn_factor, beta_fast, beta_slow,
         nb00, nb01, nb02, nb03,
         ne0,
         nb0, nb1, nb2, nb3
     };

-    auto name = std::string(__func__) + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
+    auto & inC_ = inC ? inC : inA;
+    const bool is_neox = mode & GGML_ROPE_TYPE_NEOX;
+    const bool is_f16 = src0t == GGML_TYPE_F16;
+
+    auto name = std::string(__func__) + (is_neox ? "_neox" : "_norm") + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32");
     std::shared_ptr<kp::Algorithm> s_algo = nullptr;
     if (!komputeManager()->hasAlgorithm(name)) {
+        auto & spirv = is_neox ? is_f16 ? spirv_neox_f16 : spirv_neox_f32 : is_f16 ? spirv_norm_f16 : spirv_norm_f32;
         s_algo = komputeManager()->algorithm<float, PushConstants>(
-            name, s_kompute_context->pool.get(), {inA, inB, out},
-            src0t == GGML_TYPE_F16 ? spirv_f16 : spirv_f32,
+            name, s_kompute_context->pool.get(), {inA, inB, inC_, out}, spirv,
             {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}
         );
     } else {
         s_algo = komputeManager()->getAlgorithm(name);
-        s_algo->setTensors({inA, inB, out});
+        s_algo->setTensors({inA, inB, inC_, out});
         s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)});
         s_algo->setPushConstants<PushConstants>({pushConsts});
         s_algo->updateDescriptors(s_kompute_context->pool.get());
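With four SPIR-V blobs the variant is picked from the rope mode and element type, and each element pair's angle follows the usual formulation: theta = pos * freq_base^(-2*i0/n_dims), divided by the optional phi3 frequency factor before rope_yarn applies the YaRN corrections. A scalar sketch of that angle computation (it mirrors the shaders, it is not host code from the commit):

    #include <cmath>
    #include <cstdint>

    // Scalar reference for the per-element angle in the rope shaders. `pos`
    // comes from src1 (positions); `freq_factor` comes from src2 when
    // phi3-style frequency factors are present, and is 1.0f otherwise.
    static float rope_theta(int32_t pos, float freq_base,
                            uint32_t i0, int32_t n_dims, float freq_factor) {
        const float inv_ndims = -1.0f / (float) n_dims;
        const float theta = (float) pos * std::pow(freq_base, inv_ndims * (float) i0);
        return theta / freq_factor;  // fed to rope_yarn together with freq_scale
    }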
@@ -1351,11 +1393,15 @@ static void ggml_vk_cpy_f16_f32(Args&&... args) {
 }

 static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+    int64_t n = ggml_nelements(op);
     switch (op->op) {
         case GGML_OP_UNARY:
+            if (n % 4 != 0) return false;
             switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_GELU:
+                    if (n % 8 != 0) return false;
+                    // fall through
+                case GGML_UNARY_OP_RELU:
                 case GGML_UNARY_OP_SILU:
                     return ggml_is_contiguous(op->src[0]);
                 default:
@@ -1413,8 +1459,8 @@ static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, cons
             switch (op->src[0]->type) {
                 case GGML_TYPE_F32:
-                case GGML_TYPE_Q6_K:
                     return op->ne[3] == 1;
+                case GGML_TYPE_Q6_K:
                 case GGML_TYPE_F16:
                 case GGML_TYPE_Q8_0:
                 case GGML_TYPE_Q4_0:
@@ -1515,9 +1561,11 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
         const static std::shared_ptr<kp::Tensor> nullTensor = nullptr;
         uint32_t off_src0 = 0;
         uint32_t off_src1 = 0;
+        uint32_t off_src2 = 0;
         uint32_t off_dst  = 0;
         const std::shared_ptr<kp::Tensor>& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor;
         const std::shared_ptr<kp::Tensor>& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor;
+        const std::shared_ptr<kp::Tensor>& id_src2 = src2 ? ggml_vk_get_tensor(src2, &off_src2) : nullTensor;
         const std::shared_ptr<kp::Tensor>& id_dst  = dst  ? ggml_vk_get_tensor(dst,  &off_dst)  : nullTensor;

         switch (dst->op) {
@@ -1593,11 +1641,16 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
 #pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/5021")
                     GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);

-#pragma message("TODO: add ALiBi support")
-#pragma message("ref:  https://github.com/ggerganov/llama.cpp/pull/7192")
-                    GGML_ASSERT(max_bias == 0.0f);
+                    const int64_t nrows_x = ggml_nrows(src0);
+                    const int64_t nrows_y = src0->ne[1];

-                    ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
+                    const uint32_t n_head      = nrows_x/nrows_y;
+                    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+                    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+                    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
+
+                    ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale, max_bias, m0, m1, n_head_log2);
                 } break;
             case GGML_OP_DIAG_MASK_INF:
                 {
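The m0/m1/n_head_log2 values computed here implement the standard ALiBi slope schedule: the first n_head_log2 heads take powers of m0, the remainder powers of m1, which is exactly what op_softmax.comp recomputes per head from h = i02. A scalar sketch of the resulting slope:

    #include <cmath>
    #include <cstdint>

    // ALiBi slope for head h, matching the host-side m0/m1/n_head_log2 above
    // and the per-head selection done in op_softmax.comp.
    static float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
        const uint32_t n_head_log2 = 1u << (uint32_t) std::floor(std::log2((float) n_head));
        const float m0 = std::pow(2.0f, -(max_bias       ) / n_head_log2);
        const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);
        return h < n_head_log2 ? std::pow(m0, (float) (h + 1))
                               : std::pow(m1, (float) (2*(h - n_head_log2) + 1));
    }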
@@ -1649,38 +1702,44 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                         case GGML_TYPE_F16:
                             ggml_vk_mul_mat_f16(
                                 seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                ne00, ne01, ne02, nb00, nb01, nb02,
-                                ne10, ne11, ne12, ne13, nb10, nb11, nb12,
+                                ne00, ne01, ne02, nb00, nb01, nb02, nb03,
+                                ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13,
                                 ne0, ne1, r2, r3
                             );
                             break;
                         case GGML_TYPE_Q8_0:
                             ggml_vk_mul_mat_q8_0(
                                 seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
-                                r2, r3
+                                ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                             );
                             break;
                         case GGML_TYPE_Q4_0:
                             ggml_vk_mul_mat_q4_0(
                                 seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
-                                r2, r3
+                                ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                             );
                             break;
                         case GGML_TYPE_Q4_1:
                             ggml_vk_mul_mat_q4_1(
                                 seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
-                                r2, r3
+                                ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                             );
                             break;
                         case GGML_TYPE_Q4_K:
                             ggml_vk_mul_mat_q4_k(
                                 seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
-                                r2, r3
+                                ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                             );
                             break;
                         case GGML_TYPE_Q6_K:
                             ggml_vk_mul_mat_q6_k(
                                 seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst,
-                                ne00, ne10, ne0, ne1, ne01, ne11, ne12, ne02,
+                                ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1,
+                                nb01, nb02, nb03, nb11, nb12, nb13, r2, r3
                             );
                             break;
                         default: {
@@ -1709,13 +1768,6 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                 } break;
             case GGML_OP_ROPE:
                 {
-#pragma message("TODO: implement phi3 frequency factors support")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7225")
-                    GGML_ASSERT(dst->src[2] == nullptr && "phi3 frequency factors not implemented yet");
-
-#pragma message("TODO: update rope NORM mode to match NEOX mode")
-#pragma message("      https://github.com/ggerganov/llama.cpp/pull/7634")
-
                     GGML_ASSERT(ne10 == ne02);
                     GGML_ASSERT(src0t == dstt);
                     // const int n_past = ((int32_t *) dst->op_params)[0];
@@ -1724,6 +1776,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                     // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan
                     const int n_ctx_orig = ((int32_t *) dst->op_params)[4];

+                    const bool has_freq_factors = dst->src[2] != nullptr;
+
                     float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
                     memcpy(&freq_base,  (int32_t *) dst->op_params + 5, sizeof(float));
                     memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float));
@@ -1732,8 +1786,8 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
                     memcpy(&beta_fast,  (int32_t *) dst->op_params + 9,  sizeof(float));
                     memcpy(&beta_slow,  (int32_t *) dst->op_params + 10, sizeof(float));
                     ggml_vk_rope(
-                        seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, src0t, n_dims, mode, n_ctx_orig,
-                        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow,
+                        seq, id_src0, id_src1, id_src2, id_dst, off_src0, off_src1, off_src2, off_dst, src0t, n_dims, mode, n_ctx_orig,
+                        freq_base, freq_scale, has_freq_factors, ext_factor, attn_factor, beta_fast, beta_slow,
                         ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3
                     );
                 } break;
ggml/src/ggml-kompute/kompute-shaders/common.comp
CHANGED
|
@@ -3,6 +3,7 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_EXT_shader_explicit_arithmetic_types_int8: require
 #extension GL_EXT_shader_explicit_arithmetic_types_int16: require
+#extension GL_EXT_shader_explicit_arithmetic_types_int64: require
 #extension GL_EXT_control_flow_attributes: enable
 #extension GL_KHR_shader_subgroup_arithmetic : require
 #extension GL_EXT_debug_printf : enable
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp
CHANGED
|
@@ -20,12 +20,14 @@ layout (push_constant) uniform parameter {
     uint nb00;
     uint nb01;
     uint nb02;
+    uint nb03;
     int ne10;
     int ne11;
     int ne12;
     uint nb10;
     uint nb11;
     uint nb12;
+    uint nb13;
     int ne0;
     int ne1;
     uint r2;
@@ -42,7 +44,7 @@ void main() {
     const uint i12 = im%pcs.ne12;
     const uint i13 = im/pcs.ne12;

-    const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb02*pcs.ne02;
+    const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03;

     const uint x = offset0 / 2 + pcs.inAOff; // Based from inA
@@ -52,7 +54,7 @@ void main() {
         break;
     }

-    const uint y = (r1*pcs.nb11 + im*pcs.nb12) / 4 + pcs.inBOff; // Based from inB
+    const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;

     float sumf = 0;
     for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) {
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp
CHANGED
|
@@ -24,8 +24,14 @@ layout (push_constant) uniform parameter {
     int ne01;
     int ne02;
     int ne12;
-    int r2;
-    int r3;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    uint nb11;
+    uint nb12;
+    uint nb13;
+    uint r2;
+    uint r3;
 } pcs;

 void main() {
@@ -50,10 +56,11 @@ void main() {
     const uint i12 = im%pcs.ne12;
     const uint i13 = im/pcs.ne12;

-    const uint offset0 = (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
+    const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
+    const uint offset1 = r1*pcs.nb11 + (i12        )*pcs.nb12 + (i13        )*pcs.nb13;

-    const uint xblk = ib_row + offset0 + pcs.inAOff;
-    const uint y = r1*uint(pcs.ne10) + im*pcs.ne00*pcs.ne1 + pcs.inBOff;
+    const uint xblk = offset0 + pcs.inAOff;
+    const uint y = (offset1 / 4) + pcs.inBOff;

     float yl[16];
     float yh[16];
@@ -74,7 +81,7 @@ void main() {
     }

     for (int row = 0; row < N_DST; row++) {
-        uint row_idx = row * nb;
+        uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK);

         uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
         uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp
CHANGED
|
@@ -21,7 +21,16 @@ layout (push_constant) uniform parameter {
     int ne0;
     int ne1;
     int ne01;
-    int gqa;
+    int ne02;
+    int ne12;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    uint nb11;
+    uint nb12;
+    uint nb13;
+    uint r2;
+    uint r3;
 } pcs;

 void main() {
@@ -34,12 +43,15 @@ void main() {

     const uint r0 = gl_WorkGroupID.x;
     const uint r1 = gl_WorkGroupID.y;
-    const uint r2 = gl_WorkGroupID.z;
+    const uint im = gl_WorkGroupID.z;

     const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID);
-    const uint offset0 = r2/pcs.gqa*(nb*pcs.ne0);
-    const uint x = row * nb + offset0; // Based from inA without base offset
-    const uint yy = r1*pcs.ne10 + r2*pcs.ne00*pcs.ne1 + pcs.inBOff; // Based from inB
+
+    const uint i12 = im%pcs.ne12;
+    const uint i13 = im/pcs.ne12;
+
+    const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
+    const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;

     float sumf = 0;
@@ -89,6 +101,6 @@ void main() {

     const float tot = subgroupAdd(sumf);
     if (subgroupElect()) {
-        out_[r1*pcs.ne0 + r2*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
+        out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot;
     }
 }
ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp
CHANGED
|
@@ -14,10 +14,15 @@ void main() {
     const uint i12 = im%pcs.ne12;
     const uint i13 = im/pcs.ne12;

-    const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
+    // pointers to src0 rows
+    uint ax[N_ROWS];
+    for (int row = 0; row < N_ROWS; ++row) {
+        const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK);
+
+        ax[row] = offset0 + pcs.inAOff;
+    }

-    const uint x = offset0; // Based from inA without base offset
-    const uint y = r1*uint(pcs.ne10)+im*pcs.ne00*pcs.ne1+pcs.inBOff; // Based from inB
+    const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff;

     float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f};
@@ -32,8 +37,7 @@ void main() {

     for (uint ib = ix; ib < nb; ib += 16) {
         for (int row = 0; row < N_ROWS; row++) {
-            const uint block_index = x + ib + row * nb;
-            sumf[row] += block_q_n_dot_y(block_index, yb, il);
+            sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il);
         }

         yb += BLOCKS_IN_QUANT * 16;
ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp
CHANGED
|
@@ -1,5 +1,5 @@
 layout(local_size_x_id = 0) in;
-layout(local_size_y = 1) in;
+layout(local_size_y = 8) in;
 layout(local_size_z = 1) in;

 layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; };
@@ -17,6 +17,12 @@ layout (push_constant) uniform parameter {
     int ne12;
     int ne0;
     int ne1;
+    uint nb01;
+    uint nb02;
+    uint nb03;
+    uint nb11;
+    uint nb12;
+    uint nb13;
     uint r2;
     uint r3;
 } pcs;
ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp
ADDED
|
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float16_t inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int       inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float     inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + ic*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            const float x0 = float(inA[src]);
+            const float x1 = float(inA[src+pcs.n_dims/2]);
+
+            out_[dst_data]              = float16_t(x0*cos_theta - x1*sin_theta);
+            out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 2) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp
ADDED
|
@@ -0,0 +1,52 @@
+#version 450
+
+#include "rope_common.comp"
+
+layout(binding = 0) buffer restrict readonly  tensorInA { float inA[]; };
+layout(binding = 1) buffer restrict readonly  tensorInB { int   inB[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };
+
+void main() {
+    const uint i3 = gl_WorkGroupID.z;
+    const uint i2 = gl_WorkGroupID.y;
+    const uint i1 = gl_WorkGroupID.x;
+
+    float corr_dims[2];
+    rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);
+
+    const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);
+
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;
+
+    float cos_theta;
+    float sin_theta;
+
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;
+
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + ic*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            const float x0 = inA[src];
+            const float x1 = inA[src+pcs.n_dims/2];
+
+            out_[dst_data]              = x0*cos_theta - x1*sin_theta;
+            out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
+        } else {
+            const uint src      = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
+            const uint dst_data = uint((i3*pcs.nb3  + i2*pcs.nb2  + i1*pcs.nb1  + i0*pcs.nb0)  / 4) + pcs.outOff; // Based from out_
+
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
+        }
+    }
+}
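The only difference between the norm and neox shader families is which elements form a rotation pair: norm rotates adjacent elements (i0, i0+1), while neox rotates (ic, ic + n_dims/2) with ic = i0/2, matching the split-halves layout used by NeoX-style models. A scalar reference of the shared rotation step:

    #include <cstddef>

    // One rotation step shared by both rope modes; only the index pair differs.
    static void rope_rotate_pair(float * row, size_t a, size_t b,
                                 float cos_theta, float sin_theta) {
        const float x0 = row[a], x1 = row[b];
        row[a] = x0*cos_theta - x1*sin_theta;
        row[b] = x0*sin_theta + x1*cos_theta;
    }

    // norm mode: rope_rotate_pair(row, i0, i0 + 1, c, s);
    // neox mode: rope_rotate_pair(row, i0/2, i0/2 + n_dims/2, c, s);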
ggml/src/ggml-kompute/kompute-shaders/{op_rope_f16.comp → op_rope_norm_f16.comp}
RENAMED
|
@@ -4,30 +4,34 @@

 layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; };
 layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float16_t out_[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float     inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; };

 void main() {
     const uint i3 = gl_WorkGroupID.z;
     const uint i2 = gl_WorkGroupID.y;
     const uint i1 = gl_WorkGroupID.x;

-    const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;
-
     float corr_dims[2];
     rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);

     const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);

-    const int p = inB[pcs.inBOff + i2];
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;

-    float theta = float(p);
+    float cos_theta;
+    float sin_theta;

-    if (!is_neox) {
-        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;

-            theta *= theta_scale;
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);

             const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
             const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
@@ -37,37 +41,12 @@ void main() {

             out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
             out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta);
-        }
-    } else {
-        const float inv_ndims = -1.f/pcs.n_dims;
-        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
-            const uint cur_rot = ic;
-
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint i0 = ic/2;
-
-            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_
-
-            const float x0 = float(inA[src]);
-            const float x1 = float(inA[src+pcs.n_dims/2]);
-
-            out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta);
-            out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta);
-        }
-
-        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
-            const uint i0 = ic;
-
+        } else {
             const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in
             const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_

-            out_[dst_data + 0] = inA[src + 0];
-            out_[dst_data + 1] = inA[src + 1];
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
         }
     }
 }
ggml/src/ggml-kompute/kompute-shaders/{op_rope_f32.comp → op_rope_norm_f32.comp}
RENAMED
|
@@ -4,30 +4,34 @@

 layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; };
 layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; };
-layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; };
+layout(binding = 2) buffer restrict readonly  tensorInC { float inC[]; };
+layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; };

 void main() {
     const uint i3 = gl_WorkGroupID.z;
     const uint i2 = gl_WorkGroupID.y;
     const uint i1 = gl_WorkGroupID.x;

-    const bool is_neox = (pcs.mode & GGML_ROPE_TYPE_NEOX) != 0;
-
     float corr_dims[2];
     rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims);

     const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims);

-    const int p = inB[pcs.inBOff + i2];
+    float theta_base = float(inB[pcs.inBOff + i2]);
+    float inv_ndims = -1.f/pcs.n_dims;

-    float theta = float(p);
+    float cos_theta;
+    float sin_theta;

-    if (!is_neox) {
-        for (uint i0 = 0; i0 < pcs.ne0; i0 += 2) {
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
+    for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) {
+        if (i0 < pcs.n_dims) {
+            uint ic = i0/2;

-            theta *= theta_scale;
+            float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0);
+
+            const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f;
+
+            rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);

             const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
             const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
@@ -37,37 +41,12 @@ void main() {

             out_[dst_data] = x0*cos_theta - x1*sin_theta;
             out_[dst_data+1] = x0*sin_theta + x1*cos_theta;
-        }
-    } else {
-        const float inv_ndims = -1.f/pcs.n_dims;
-        for (uint ic = 0; ic < pcs.n_dims; ic += 2) {
-            const uint cur_rot = ic;
-
-            float cos_theta, sin_theta;
-            rope_yarn(theta, pcs.freq_scale, corr_dims, cur_rot, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta);
-
-            theta *= theta_scale;
-
-            const uint i0 = ic/2;
-
-            const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
-            const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_
-
-            const float x0 = inA[src];
-            const float x1 = inA[src+pcs.n_dims/2];
-
-            out_[dst_data] = x0*cos_theta - x1*sin_theta;
-            out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta;
-        }
-
-        for (uint ic = pcs.n_dims; ic < pcs.ne0; ic += 2) {
-            const uint i0 = ic;
-
+        } else {
             const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in
             const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_

-            out_[dst_data + 0] = inA[src + 0];
-            out_[dst_data + 1] = inA[src + 1];
+            out_[dst_data]   = inA[src];
+            out_[dst_data+1] = inA[src+1];
         }
     }
 }
ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp
CHANGED
|
@@ -18,6 +18,10 @@ layout(push_constant) uniform PushConstants {
     int ne01;
     int ne02;
     float scale;
+    float max_bias;
+    float m0;
+    float m1;
+    uint n_head_log2;
     int mask;
 } pcs;

@@ -34,17 +38,29 @@ void main() {
     const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB
     const uint pdst = extra_off + pcs.outOff; // Based from out_

+    float slope = 1.0f;
+
+    // ALiBi
+    if (pcs.max_bias > 0.0f) {
+        int64_t h = i02;
+
+        float   base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1;
+        int64_t exp  = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1;
+
+        slope = pow(base, float(exp));
+    }
+
     // parallel max
     float localMax = uintBitsToFloat(0xFF800000);
     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f));
+        localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f));
     }
     float max_ = subgroupMax(localMax);

     // parallel sum
     float localSum = 0.0f;
     for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) {
-        const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? inB[pmask + i00] : 0.0f) - max_);
+        const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_);
         localSum += exp_psrc0;
         out_[pdst + i00] = exp_psrc0;
     }
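For reference, the row-wise computation this shader parallelizes across a subgroup is a plain masked, scaled softmax with the ALiBi slope applied to the mask term. A scalar sketch:

    #include <algorithm>
    #include <cmath>

    // Scalar reference for one row of op_softmax.comp: scale the logits, add
    // slope*mask, subtract the row max for stability, exponentiate, normalize.
    static void softmax_row(const float * x, const float * mask, float * out,
                            int ne00, float scale, float slope) {
        float max_ = -INFINITY;
        for (int i = 0; i < ne00; ++i)
            max_ = std::max(max_, x[i]*scale + (mask ? slope*mask[i] : 0.0f));
        float sum = 0.0f;
        for (int i = 0; i < ne00; ++i) {
            out[i] = std::exp(x[i]*scale + (mask ? slope*mask[i] : 0.0f) - max_);
            sum   += out[i];
        }
        for (int i = 0; i < ne00; ++i) out[i] /= sum;
    }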
ggml/src/ggml-kompute/kompute-shaders/rope_common.comp
CHANGED
|
@@ -8,12 +8,14 @@ layout(local_size_x = 1) in;
 layout (push_constant) uniform parameter {
     uint inAOff;
     uint inBOff;
+    uint inCOff;
     uint outOff;
     int n_dims;
     int mode;
     int n_ctx_orig;
     float freq_base;
     float freq_scale;
+    bool has_freq_factors;
     float ext_factor;
     float attn_factor;
     float beta_fast;