Spaces:
Running
Running
sycl : update IQ1_S kernels (WIP - not working!) (llama/5995)
Browse files* sycl : try to fix after IQ1_S changes
* sycl : iq1s_grid -> iq1s_grid_gpu
* sycl : fix grid type
- ggml-sycl.cpp +34 -32
ggml-sycl.cpp
CHANGED
|
@@ -3514,8 +3514,8 @@ static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N
|
|
| 3514 |
#define QI1_S (QK_K / (4*QR1_S))
|
| 3515 |
typedef struct {
|
| 3516 |
sycl::half d;
|
| 3517 |
-
uint8_t
|
| 3518 |
-
|
| 3519 |
} block_iq1_s;
|
| 3520 |
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
| 3521 |
|
|
@@ -4891,10 +4891,9 @@ static void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restr
|
|
| 4891 |
template<typename dst_t>
|
| 4892 |
static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
| 4893 |
const sycl::nd_item<3> &item_ct1,
|
| 4894 |
-
const
|
| 4895 |
const uint8_t *ksigns_iq2xs,
|
| 4896 |
const uint8_t *kmask_iq2xs) {
|
| 4897 |
-
|
| 4898 |
const int i = item_ct1.get_group(2);
|
| 4899 |
const block_iq1_s * x = (const block_iq1_s *) vx;
|
| 4900 |
|
|
@@ -4903,11 +4902,15 @@ static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restr
|
|
| 4903 |
const int il = tid/8; // 0...3
|
| 4904 |
const int ib = tid%8; // 0...7
|
| 4905 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 4906 |
-
const
|
| 4907 |
-
uint8_t
|
| 4908 |
-
const
|
| 4909 |
-
const float d = (float)x[i].d * (2*(
|
| 4910 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4911 |
#else
|
| 4912 |
assert(false);
|
| 4913 |
#endif
|
|
@@ -7803,28 +7806,27 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
|
|
| 7803 |
static __dpct_inline__ float
|
| 7804 |
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
| 7805 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
| 7806 |
-
const
|
| 7807 |
#if QK_K == 256
|
| 7808 |
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
| 7809 |
|
| 7810 |
const int ib32 = iqs;
|
| 7811 |
-
|
| 7812 |
-
const
|
| 7813 |
-
|
| 7814 |
-
|
| 7815 |
-
|
| 7816 |
-
|
| 7817 |
-
|
| 7818 |
-
|
| 7819 |
-
|
| 7820 |
-
|
| 7821 |
-
|
| 7822 |
-
|
| 7823 |
-
|
| 7824 |
-
}
|
| 7825 |
-
const float d = (float)bq1->d
|
| 7826 |
-
return d *
|
| 7827 |
-
sumi3 * (2*(h2 & 7) + 1) + sumi4 * (2*((h2 >> 4) & 7) + 1));
|
| 7828 |
#else
|
| 7829 |
assert(false);
|
| 7830 |
return 0.f;
|
|
@@ -8644,7 +8646,7 @@ static void mul_mat_vec_q_iq3_s_q8_1(const void * __restrict__ vx, const void *
|
|
| 8644 |
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8645 |
static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
|
| 8646 |
const sycl::nd_item<3> &item_ct1,
|
| 8647 |
-
const
|
| 8648 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8649 |
item_ct1.get_local_id(1);
|
| 8650 |
|
|
@@ -10406,7 +10408,7 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
|
|
| 10406 |
dpct::queue_ptr stream) {
|
| 10407 |
const int nb = k / QK_K;
|
| 10408 |
{
|
| 10409 |
-
|
| 10410 |
ksigns_iq2xs.init(*stream);
|
| 10411 |
kmask_iq2xs.init(*stream);
|
| 10412 |
|
|
@@ -10414,7 +10416,7 @@ static void dequantize_row_iq1_s_sycl(const void *vx, dst_t *y, const int k,
|
|
| 10414 |
{sycl::aspect::fp16});
|
| 10415 |
|
| 10416 |
stream->submit([&](sycl::handler &cgh) {
|
| 10417 |
-
auto iq1s_grid_ptr_ct1 =
|
| 10418 |
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
|
| 10419 |
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
|
| 10420 |
|
|
@@ -11154,11 +11156,11 @@ static void mul_mat_vec_iq1_s_q8_1_sycl(const void *vx, const void *vy,
|
|
| 11154 |
const sycl::range<3> block_nums(1, 1, block_num_y);
|
| 11155 |
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
| 11156 |
{
|
| 11157 |
-
|
| 11158 |
ksigns64.init(*stream);
|
| 11159 |
|
| 11160 |
stream->submit([&](sycl::handler &cgh) {
|
| 11161 |
-
auto iq1s_grid_ptr_ct1 =
|
| 11162 |
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
| 11163 |
|
| 11164 |
cgh.parallel_for(
|
|
|
|
| 3514 |
#define QI1_S (QK_K / (4*QR1_S))
|
| 3515 |
typedef struct {
|
| 3516 |
sycl::half d;
|
| 3517 |
+
uint8_t qs[QK_K/8];
|
| 3518 |
+
uint16_t qh[QK_K/32];
|
| 3519 |
} block_iq1_s;
|
| 3520 |
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
| 3521 |
|
|
|
|
| 4891 |
template<typename dst_t>
|
| 4892 |
static void dequantize_block_iq1_s(const void * __restrict__ vx, dst_t * __restrict__ yy,
|
| 4893 |
const sycl::nd_item<3> &item_ct1,
|
| 4894 |
+
const uint32_t *iq1s_grid,
|
| 4895 |
const uint8_t *ksigns_iq2xs,
|
| 4896 |
const uint8_t *kmask_iq2xs) {
|
|
|
|
| 4897 |
const int i = item_ct1.get_group(2);
|
| 4898 |
const block_iq1_s * x = (const block_iq1_s *) vx;
|
| 4899 |
|
|
|
|
| 4902 |
const int il = tid/8; // 0...3
|
| 4903 |
const int ib = tid%8; // 0...7
|
| 4904 |
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
|
| 4905 |
+
const uint8_t * qs = x[i].qs + 8*ib;
|
| 4906 |
+
const uint8_t * grid1 = (const uint8_t *)(iq1s_grid + qs[2*il+0]);
|
| 4907 |
+
const uint8_t * grid2 = (const uint8_t *)(iq1s_grid + qs[2*il+1]);
|
| 4908 |
+
const float d = (float)x[i].d * (2*((x[i].qh[ib] >> 12) & 0xf) + 1);
|
| 4909 |
+
const uint8_t signs = ksigns_iq2xs[(x[i].qh[ib] >> 3*il) & 7];
|
| 4910 |
+
for (int j = 0; j < 4; ++j) {
|
| 4911 |
+
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
|
| 4912 |
+
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
|
| 4913 |
+
}
|
| 4914 |
#else
|
| 4915 |
assert(false);
|
| 4916 |
#endif
|
|
|
|
| 7806 |
static __dpct_inline__ float
|
| 7807 |
vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
|
| 7808 |
const block_q8_1 *__restrict__ bq8_1, const int &iqs,
|
| 7809 |
+
const uint32_t *iq1s_grid, const uint64_t *ksigns64) {
|
| 7810 |
#if QK_K == 256
|
| 7811 |
const block_iq1_s * bq1 = (const block_iq1_s *) vbq;
|
| 7812 |
|
| 7813 |
const int ib32 = iqs;
|
| 7814 |
+
const uint8_t * qs = bq1->qs + 4*ib32;
|
| 7815 |
+
const int8_t * q8 = bq8_1[ib32].qs;
|
| 7816 |
+
int sumi = 0;
|
| 7817 |
+
for (int l = 0; l < 4; ++l) {
|
| 7818 |
+
const uint32_t * grid = (const uint32_t *)(iq1s_grid + qs[l]);
|
| 7819 |
+
const uint32_t * signs = (const uint32_t *)(ksigns64 + (qs[l] >> 8));
|
| 7820 |
+
const int grid_l = dpct::vectorized_binary<sycl::uchar4>(
|
| 7821 |
+
grid[0] ^ signs[0], signs[0], std::minus<>());
|
| 7822 |
+
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
|
| 7823 |
+
grid[1] ^ signs[1], signs[1], std::minus<>());
|
| 7824 |
+
sumi = dpct::dp4a(grid_l, *((int *)q8 + 0), sumi);
|
| 7825 |
+
sumi = dpct::dp4a(grid_h, *((int *)q8 + 1), sumi);
|
| 7826 |
+
q8 += 8;
|
| 7827 |
+
}
|
| 7828 |
+
const float d = (float)bq1->d * bq8_1[ib32].ds[0] * 0.25f;
|
| 7829 |
+
return d * sumi;
|
|
|
|
| 7830 |
#else
|
| 7831 |
assert(false);
|
| 7832 |
return 0.f;
|
|
|
|
| 8646 |
template <int qk, int qi, typename block_q_t, int vdr>
|
| 8647 |
static void mul_mat_vec_q_iq1_s_q8_1(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
|
| 8648 |
const sycl::nd_item<3> &item_ct1,
|
| 8649 |
+
const uint32_t *iq1s_grid_ptr, const uint64_t *ksigns64_ptr ) {
|
| 8650 |
const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
|
| 8651 |
item_ct1.get_local_id(1);
|
| 8652 |
|
|
|
|
| 10408 |
dpct::queue_ptr stream) {
|
| 10409 |
const int nb = k / QK_K;
|
| 10410 |
{
|
| 10411 |
+
iq1s_grid_gpu.init(*stream);
|
| 10412 |
ksigns_iq2xs.init(*stream);
|
| 10413 |
kmask_iq2xs.init(*stream);
|
| 10414 |
|
|
|
|
| 10416 |
{sycl::aspect::fp16});
|
| 10417 |
|
| 10418 |
stream->submit([&](sycl::handler &cgh) {
|
| 10419 |
+
auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
|
| 10420 |
auto ksigns_iq2xs_ptr_ct1 = ksigns_iq2xs.get_ptr();
|
| 10421 |
auto kmask_iq2xs_ptr_ct1 = kmask_iq2xs.get_ptr();
|
| 10422 |
|
|
|
|
| 11156 |
const sycl::range<3> block_nums(1, 1, block_num_y);
|
| 11157 |
const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
|
| 11158 |
{
|
| 11159 |
+
iq1s_grid_gpu.init(*stream);
|
| 11160 |
ksigns64.init(*stream);
|
| 11161 |
|
| 11162 |
stream->submit([&](sycl::handler &cgh) {
|
| 11163 |
+
auto iq1s_grid_ptr_ct1 = iq1s_grid_gpu.get_ptr();
|
| 11164 |
auto ksigns64_ptr_ct1 = ksigns64.get_ptr();
|
| 11165 |
|
| 11166 |
cgh.parallel_for(
|