Spaces:
Running
Running
ggml : reuse quantum structs across backends (llama/5943)
Browse files* ggml : reuse quant blocks across backends
ggml-ci
* ggml : define helper constants only for CUDA and SYCL
ggml-ci
* ggml : define helper quantum constants for SYCL
ggml-ci
- ggml-cuda.cu +11 -233
- ggml-metal.m +1 -1
- ggml-metal.metal +4 -176
- ggml-quants.c +41 -37
- ggml-quants.h +4 -240
- ggml-sycl.cpp +1 -191
ggml-cuda.cu
CHANGED
|
@@ -2,7 +2,13 @@
|
|
| 2 |
#include "ggml.h"
|
| 3 |
#include "ggml-backend-impl.h"
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
#define GGML_COMMON_IMPL_CUDA
|
|
|
|
| 6 |
#include "ggml-common.h"
|
| 7 |
|
| 8 |
#include <algorithm>
|
|
@@ -359,66 +365,6 @@ typedef void (*ggml_cuda_op_flatten_t)(
|
|
| 359 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 360 |
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
|
| 361 |
|
| 362 |
-
// QK = number of values after dequantization
|
| 363 |
-
// QR = QK / number of values before dequantization
|
| 364 |
-
// QI = number of 32 bit integers before dequantization
|
| 365 |
-
|
| 366 |
-
#define QK4_0 32
|
| 367 |
-
#define QR4_0 2
|
| 368 |
-
#define QI4_0 (QK4_0 / (4 * QR4_0))
|
| 369 |
-
typedef struct {
|
| 370 |
-
half d; // delta
|
| 371 |
-
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
| 372 |
-
} block_q4_0;
|
| 373 |
-
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
| 374 |
-
|
| 375 |
-
#define QK4_1 32
|
| 376 |
-
#define QR4_1 2
|
| 377 |
-
#define QI4_1 (QK4_1 / (4 * QR4_1))
|
| 378 |
-
typedef struct {
|
| 379 |
-
half2 dm; // dm.x = delta, dm.y = min
|
| 380 |
-
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
| 381 |
-
} block_q4_1;
|
| 382 |
-
static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
| 383 |
-
|
| 384 |
-
#define QK5_0 32
|
| 385 |
-
#define QR5_0 2
|
| 386 |
-
#define QI5_0 (QK5_0 / (4 * QR5_0))
|
| 387 |
-
typedef struct {
|
| 388 |
-
half d; // delta
|
| 389 |
-
uint8_t qh[4]; // 5-th bit of quants
|
| 390 |
-
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
| 391 |
-
} block_q5_0;
|
| 392 |
-
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
|
| 393 |
-
|
| 394 |
-
#define QK5_1 32
|
| 395 |
-
#define QR5_1 2
|
| 396 |
-
#define QI5_1 (QK5_1 / (4 * QR5_1))
|
| 397 |
-
typedef struct {
|
| 398 |
-
half2 dm; // dm.x = delta, dm.y = min
|
| 399 |
-
uint8_t qh[4]; // 5-th bit of quants
|
| 400 |
-
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
| 401 |
-
} block_q5_1;
|
| 402 |
-
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
|
| 403 |
-
|
| 404 |
-
#define QK8_0 32
|
| 405 |
-
#define QR8_0 1
|
| 406 |
-
#define QI8_0 (QK8_0 / (4 * QR8_0))
|
| 407 |
-
typedef struct {
|
| 408 |
-
half d; // delta
|
| 409 |
-
int8_t qs[QK8_0]; // quants
|
| 410 |
-
} block_q8_0;
|
| 411 |
-
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
| 412 |
-
|
| 413 |
-
#define QK8_1 32
|
| 414 |
-
#define QR8_1 1
|
| 415 |
-
#define QI8_1 (QK8_1 / (4 * QR8_1))
|
| 416 |
-
typedef struct {
|
| 417 |
-
half2 ds; // ds.x = delta, ds.y = sum
|
| 418 |
-
int8_t qs[QK8_0]; // quants
|
| 419 |
-
} block_q8_1;
|
| 420 |
-
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
|
| 421 |
-
|
| 422 |
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
|
| 423 |
typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
|
| 424 |
typedef void (*load_tiles_cuda_t)(
|
|
@@ -428,174 +374,6 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)(
|
|
| 428 |
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
| 429 |
const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
|
| 430 |
|
| 431 |
-
//================================= k-quants
|
| 432 |
-
|
| 433 |
-
#ifdef GGML_QKK_64
|
| 434 |
-
#define QK_K 64
|
| 435 |
-
#define K_SCALE_SIZE 4
|
| 436 |
-
#else
|
| 437 |
-
#define QK_K 256
|
| 438 |
-
#define K_SCALE_SIZE 12
|
| 439 |
-
#endif
|
| 440 |
-
|
| 441 |
-
#define QR2_K 4
|
| 442 |
-
#define QI2_K (QK_K / (4*QR2_K))
|
| 443 |
-
typedef struct {
|
| 444 |
-
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
| 445 |
-
uint8_t qs[QK_K/4]; // quants
|
| 446 |
-
half2 dm; // super-block scale for quantized scales/mins
|
| 447 |
-
} block_q2_K;
|
| 448 |
-
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
| 449 |
-
|
| 450 |
-
#define QR3_K 4
|
| 451 |
-
#define QI3_K (QK_K / (4*QR3_K))
|
| 452 |
-
typedef struct {
|
| 453 |
-
uint8_t hmask[QK_K/8]; // quants - high bit
|
| 454 |
-
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
| 455 |
-
#ifdef GGML_QKK_64
|
| 456 |
-
uint8_t scales[2]; // scales, quantized with 8 bits
|
| 457 |
-
#else
|
| 458 |
-
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
|
| 459 |
-
#endif
|
| 460 |
-
half d; // super-block scale
|
| 461 |
-
} block_q3_K;
|
| 462 |
-
//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
|
| 463 |
-
|
| 464 |
-
#define QR4_K 2
|
| 465 |
-
#define QI4_K (QK_K / (4*QR4_K))
|
| 466 |
-
#ifdef GGML_QKK_64
|
| 467 |
-
typedef struct {
|
| 468 |
-
half dm[2]; // super-block scales/mins
|
| 469 |
-
uint8_t scales[2]; // 4-bit block scales/mins
|
| 470 |
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
| 471 |
-
} block_q4_K;
|
| 472 |
-
static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
| 473 |
-
#else
|
| 474 |
-
typedef struct {
|
| 475 |
-
half2 dm; // super-block scale for quantized scales/mins
|
| 476 |
-
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
| 477 |
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
| 478 |
-
} block_q4_K;
|
| 479 |
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
| 480 |
-
#endif
|
| 481 |
-
|
| 482 |
-
#define QR5_K 2
|
| 483 |
-
#define QI5_K (QK_K / (4*QR5_K))
|
| 484 |
-
#ifdef GGML_QKK_64
|
| 485 |
-
typedef struct {
|
| 486 |
-
half d; // super-block scale
|
| 487 |
-
int8_t scales[QK_K/16]; // block scales
|
| 488 |
-
uint8_t qh[QK_K/8]; // quants, high bit
|
| 489 |
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
| 490 |
-
} block_q5_K;
|
| 491 |
-
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
| 492 |
-
#else
|
| 493 |
-
typedef struct {
|
| 494 |
-
half2 dm; // super-block scale for quantized scales/mins
|
| 495 |
-
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
| 496 |
-
uint8_t qh[QK_K/8]; // quants, high bit
|
| 497 |
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
| 498 |
-
} block_q5_K;
|
| 499 |
-
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
| 500 |
-
#endif
|
| 501 |
-
|
| 502 |
-
#define QR6_K 2
|
| 503 |
-
#define QI6_K (QK_K / (4*QR6_K))
|
| 504 |
-
typedef struct {
|
| 505 |
-
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
| 506 |
-
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
| 507 |
-
int8_t scales[QK_K/16]; // scales
|
| 508 |
-
half d; // delta
|
| 509 |
-
} block_q6_K;
|
| 510 |
-
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
|
| 511 |
-
|
| 512 |
-
#define QR2_XXS 8
|
| 513 |
-
#define QI2_XXS (QK_K / (4*QR2_XXS))
|
| 514 |
-
typedef struct {
|
| 515 |
-
half d;
|
| 516 |
-
uint16_t qs[QK_K/8];
|
| 517 |
-
} block_iq2_xxs;
|
| 518 |
-
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
|
| 519 |
-
|
| 520 |
-
#define QR2_XS 8
|
| 521 |
-
#define QI2_XS (QK_K / (4*QR2_XS))
|
| 522 |
-
typedef struct {
|
| 523 |
-
half d;
|
| 524 |
-
uint16_t qs[QK_K/8];
|
| 525 |
-
uint8_t scales[QK_K/32];
|
| 526 |
-
} block_iq2_xs;
|
| 527 |
-
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
| 528 |
-
|
| 529 |
-
// 2.5625 bpw quants
|
| 530 |
-
#define QR2_S 8
|
| 531 |
-
#define QI2_S (QK_K / (4*QR2_S))
|
| 532 |
-
typedef struct {
|
| 533 |
-
half d;
|
| 534 |
-
uint8_t qs[QK_K/4];
|
| 535 |
-
uint8_t qh[QK_K/32];
|
| 536 |
-
uint8_t scales[QK_K/32];
|
| 537 |
-
} block_iq2_s;
|
| 538 |
-
static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
|
| 539 |
-
|
| 540 |
-
#define QR3_XXS 8
|
| 541 |
-
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
| 542 |
-
typedef struct {
|
| 543 |
-
half d;
|
| 544 |
-
uint8_t qs[3*(QK_K/8)];
|
| 545 |
-
} block_iq3_xxs;
|
| 546 |
-
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
| 547 |
-
|
| 548 |
-
#define QR3_XS 8
|
| 549 |
-
#define QI3_XS (QK_K / (4*QR3_XS))
|
| 550 |
-
#if QK_K == 64
|
| 551 |
-
#define IQ3S_N_SCALE 2
|
| 552 |
-
#else
|
| 553 |
-
#define IQ3S_N_SCALE QK_K/64
|
| 554 |
-
#endif
|
| 555 |
-
typedef struct {
|
| 556 |
-
half d;
|
| 557 |
-
uint8_t qs[QK_K/4];
|
| 558 |
-
uint8_t qh[QK_K/32];
|
| 559 |
-
uint8_t signs[QK_K/8];
|
| 560 |
-
uint8_t scales[IQ3S_N_SCALE];
|
| 561 |
-
} block_iq3_s;
|
| 562 |
-
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
|
| 563 |
-
|
| 564 |
-
#define QR1_S 8
|
| 565 |
-
#define QI1_S (QK_K / (4*QR1_S))
|
| 566 |
-
typedef struct {
|
| 567 |
-
half d;
|
| 568 |
-
uint8_t qs[QK_K/8];
|
| 569 |
-
uint16_t qh[QK_K/32];
|
| 570 |
-
} block_iq1_s;
|
| 571 |
-
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
| 572 |
-
|
| 573 |
-
#define QK4_NL 32
|
| 574 |
-
#define QR4_NL 2
|
| 575 |
-
#define QI4_NL (QK4_NL / (4*QR4_NL))
|
| 576 |
-
typedef struct {
|
| 577 |
-
half d;
|
| 578 |
-
uint8_t qs[QK4_NL/2];
|
| 579 |
-
} block_iq4_nl;
|
| 580 |
-
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
| 581 |
-
|
| 582 |
-
#if QK_K == 64
|
| 583 |
-
#define block_iq4_xs block_iq4_nl
|
| 584 |
-
#define QR4_XS QR4_NL
|
| 585 |
-
#define QI4_XS QI4_NL
|
| 586 |
-
#else
|
| 587 |
-
// QR4_XS = 8 is very slightly faster than QR4_XS = 4
|
| 588 |
-
#define QR4_XS 8
|
| 589 |
-
#define QI4_XS (QK_K / (4*QR4_XS))
|
| 590 |
-
typedef struct {
|
| 591 |
-
half d;
|
| 592 |
-
uint16_t scales_h;
|
| 593 |
-
uint8_t scales_l[QK_K/64];
|
| 594 |
-
uint8_t qs[QK_K/2];
|
| 595 |
-
} block_iq4_xs;
|
| 596 |
-
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
| 597 |
-
#endif
|
| 598 |
-
|
| 599 |
#define WARP_SIZE 32
|
| 600 |
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
| 601 |
|
|
@@ -3570,7 +3348,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
|
|
| 3570 |
#pragma unroll
|
| 3571 |
for (int i = 0; i < QR2_K; ++ i) {
|
| 3572 |
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
| 3573 |
-
d8[i] =
|
| 3574 |
}
|
| 3575 |
|
| 3576 |
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
|
|
@@ -3692,7 +3470,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
|
|
| 3692 |
#pragma unroll
|
| 3693 |
for (int i = 0; i < QR3_K; ++i) {
|
| 3694 |
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
| 3695 |
-
d8[i] =
|
| 3696 |
}
|
| 3697 |
|
| 3698 |
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
|
@@ -3861,7 +3639,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
|
|
| 3861 |
|
| 3862 |
for (int i = 0; i < QR4_K; ++i) {
|
| 3863 |
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
| 3864 |
-
d8[i] =
|
| 3865 |
|
| 3866 |
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
| 3867 |
u[2*i+0] = q8[0];
|
|
@@ -4226,7 +4004,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
|
|
| 4226 |
#pragma unroll
|
| 4227 |
for (int i = 0; i < QR6_K; ++i) {
|
| 4228 |
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
| 4229 |
-
d8[i] =
|
| 4230 |
}
|
| 4231 |
|
| 4232 |
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
|
@@ -4763,7 +4541,7 @@ static __device__ __forceinline__ void mul_mat_q(
|
|
| 4763 |
*dsi_dst = *dsi_src;
|
| 4764 |
} else {
|
| 4765 |
float * dfi_dst = (float *) dsi_dst;
|
| 4766 |
-
*dfi_dst =
|
| 4767 |
}
|
| 4768 |
}
|
| 4769 |
|
|
|
|
| 2 |
#include "ggml.h"
|
| 3 |
#include "ggml-backend-impl.h"
|
| 4 |
|
| 5 |
+
#if defined(GGML_USE_HIPBLAS)
|
| 6 |
+
#define GGML_COMMON_DECL_HIP
|
| 7 |
+
#define GGML_COMMON_IMPL_HIP
|
| 8 |
+
#else
|
| 9 |
+
#define GGML_COMMON_DECL_CUDA
|
| 10 |
#define GGML_COMMON_IMPL_CUDA
|
| 11 |
+
#endif
|
| 12 |
#include "ggml-common.h"
|
| 13 |
|
| 14 |
#include <algorithm>
|
|
|
|
| 365 |
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
|
| 366 |
const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream);
|
| 367 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
|
| 369 |
typedef void (*allocate_tiles_cuda_t)(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc);
|
| 370 |
typedef void (*load_tiles_cuda_t)(
|
|
|
|
| 374 |
const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc,
|
| 375 |
const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k);
|
| 376 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
#define WARP_SIZE 32
|
| 378 |
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
| 379 |
|
|
|
|
| 3348 |
#pragma unroll
|
| 3349 |
for (int i = 0; i < QR2_K; ++ i) {
|
| 3350 |
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
| 3351 |
+
d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
|
| 3352 |
}
|
| 3353 |
|
| 3354 |
return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
|
|
|
|
| 3470 |
#pragma unroll
|
| 3471 |
for (int i = 0; i < QR3_K; ++i) {
|
| 3472 |
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
|
| 3473 |
+
d8[i] = __low2float(bq8_1[bq8_offset + i].ds);
|
| 3474 |
}
|
| 3475 |
|
| 3476 |
return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
|
|
|
|
| 3639 |
|
| 3640 |
for (int i = 0; i < QR4_K; ++i) {
|
| 3641 |
const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
|
| 3642 |
+
d8[i] = __low2float(bq8i->ds);
|
| 3643 |
|
| 3644 |
const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
|
| 3645 |
u[2*i+0] = q8[0];
|
|
|
|
| 4004 |
#pragma unroll
|
| 4005 |
for (int i = 0; i < QR6_K; ++i) {
|
| 4006 |
u[i] = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
|
| 4007 |
+
d8[i] = __low2float(bq8_1[bq8_offset + 2*i].ds);
|
| 4008 |
}
|
| 4009 |
|
| 4010 |
return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
|
|
|
|
| 4541 |
*dsi_dst = *dsi_src;
|
| 4542 |
} else {
|
| 4543 |
float * dfi_dst = (float *) dsi_dst;
|
| 4544 |
+
*dfi_dst = __low2float(*dsi_src);
|
| 4545 |
}
|
| 4546 |
}
|
| 4547 |
|
ggml-metal.m
CHANGED
|
@@ -336,7 +336,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
| 336 |
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
| 337 |
|
| 338 |
#ifdef GGML_QKK_64
|
| 339 |
-
prep[@"
|
| 340 |
#endif
|
| 341 |
|
| 342 |
MTLCompileOptions* options = [MTLCompileOptions new];
|
|
|
|
| 336 |
NSMutableDictionary * prep = [NSMutableDictionary dictionary];
|
| 337 |
|
| 338 |
#ifdef GGML_QKK_64
|
| 339 |
+
prep[@"GGML_QKK_64"] = @(1);
|
| 340 |
#endif
|
| 341 |
|
| 342 |
MTLCompileOptions* options = [MTLCompileOptions new];
|
ggml-metal.metal
CHANGED
|
@@ -1,3 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#include <metal_stdlib>
|
| 2 |
|
| 3 |
#define GGML_COMMON_IMPL_METAL
|
|
@@ -9,41 +13,6 @@ using namespace metal;
|
|
| 9 |
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
| 10 |
#define SWAP(x, y) { auto tmp = (x); (x) = (y); (y) = tmp; }
|
| 11 |
|
| 12 |
-
#define QK4_0 32
|
| 13 |
-
#define QR4_0 2
|
| 14 |
-
typedef struct {
|
| 15 |
-
half d; // delta
|
| 16 |
-
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
| 17 |
-
} block_q4_0;
|
| 18 |
-
|
| 19 |
-
#define QK4_1 32
|
| 20 |
-
typedef struct {
|
| 21 |
-
half d; // delta
|
| 22 |
-
half m; // min
|
| 23 |
-
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
| 24 |
-
} block_q4_1;
|
| 25 |
-
|
| 26 |
-
#define QK5_0 32
|
| 27 |
-
typedef struct {
|
| 28 |
-
half d; // delta
|
| 29 |
-
uint8_t qh[4]; // 5-th bit of quants
|
| 30 |
-
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
| 31 |
-
} block_q5_0;
|
| 32 |
-
|
| 33 |
-
#define QK5_1 32
|
| 34 |
-
typedef struct {
|
| 35 |
-
half d; // delta
|
| 36 |
-
half m; // min
|
| 37 |
-
uint8_t qh[4]; // 5-th bit of quants
|
| 38 |
-
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
| 39 |
-
} block_q5_1;
|
| 40 |
-
|
| 41 |
-
#define QK8_0 32
|
| 42 |
-
typedef struct {
|
| 43 |
-
half d; // delta
|
| 44 |
-
int8_t qs[QK8_0]; // quants
|
| 45 |
-
} block_q8_0;
|
| 46 |
-
|
| 47 |
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
| 48 |
|
| 49 |
enum ggml_sort_order {
|
|
@@ -2478,147 +2447,6 @@ kernel void kernel_concat(
|
|
| 2478 |
}
|
| 2479 |
}
|
| 2480 |
|
| 2481 |
-
//============================================ k-quants ======================================================
|
| 2482 |
-
|
| 2483 |
-
#ifndef QK_K
|
| 2484 |
-
#define QK_K 256
|
| 2485 |
-
#else
|
| 2486 |
-
static_assert(QK_K == 256 || QK_K == 64, "QK_K must be 256 or 64");
|
| 2487 |
-
#endif
|
| 2488 |
-
|
| 2489 |
-
#if QK_K == 256
|
| 2490 |
-
#define K_SCALE_SIZE 12
|
| 2491 |
-
#else
|
| 2492 |
-
#define K_SCALE_SIZE 4
|
| 2493 |
-
#endif
|
| 2494 |
-
|
| 2495 |
-
typedef struct {
|
| 2496 |
-
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
| 2497 |
-
uint8_t qs[QK_K/4]; // quants
|
| 2498 |
-
half d; // super-block scale for quantized scales
|
| 2499 |
-
half dmin; // super-block scale for quantized mins
|
| 2500 |
-
} block_q2_K;
|
| 2501 |
-
// 84 bytes / block
|
| 2502 |
-
|
| 2503 |
-
typedef struct {
|
| 2504 |
-
uint8_t hmask[QK_K/8]; // quants - high bit
|
| 2505 |
-
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
| 2506 |
-
#if QK_K == 64
|
| 2507 |
-
uint8_t scales[2];
|
| 2508 |
-
#else
|
| 2509 |
-
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
|
| 2510 |
-
#endif
|
| 2511 |
-
half d; // super-block scale
|
| 2512 |
-
} block_q3_K;
|
| 2513 |
-
|
| 2514 |
-
#if QK_K == 64
|
| 2515 |
-
typedef struct {
|
| 2516 |
-
half d[2]; // super-block scales/mins
|
| 2517 |
-
uint8_t scales[2];
|
| 2518 |
-
uint8_t qs[QK_K/2]; // 4-bit quants
|
| 2519 |
-
} block_q4_K;
|
| 2520 |
-
#else
|
| 2521 |
-
typedef struct {
|
| 2522 |
-
half d; // super-block scale for quantized scales
|
| 2523 |
-
half dmin; // super-block scale for quantized mins
|
| 2524 |
-
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
| 2525 |
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
| 2526 |
-
} block_q4_K;
|
| 2527 |
-
#endif
|
| 2528 |
-
|
| 2529 |
-
#if QK_K == 64
|
| 2530 |
-
typedef struct {
|
| 2531 |
-
half d; // super-block scales/mins
|
| 2532 |
-
int8_t scales[QK_K/16]; // 8-bit block scales
|
| 2533 |
-
uint8_t qh[QK_K/8]; // quants, high bit
|
| 2534 |
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
| 2535 |
-
} block_q5_K;
|
| 2536 |
-
#else
|
| 2537 |
-
typedef struct {
|
| 2538 |
-
half d; // super-block scale for quantized scales
|
| 2539 |
-
half dmin; // super-block scale for quantized mins
|
| 2540 |
-
uint8_t scales[3*QK_K/64]; // scales and mins, quantized with 6 bits
|
| 2541 |
-
uint8_t qh[QK_K/8]; // quants, high bit
|
| 2542 |
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
| 2543 |
-
} block_q5_K;
|
| 2544 |
-
// 176 bytes / block
|
| 2545 |
-
#endif
|
| 2546 |
-
|
| 2547 |
-
typedef struct {
|
| 2548 |
-
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
| 2549 |
-
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
| 2550 |
-
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
| 2551 |
-
half d; // super-block scale
|
| 2552 |
-
} block_q6_K;
|
| 2553 |
-
// 210 bytes / block
|
| 2554 |
-
|
| 2555 |
-
typedef struct {
|
| 2556 |
-
half d;
|
| 2557 |
-
uint16_t qs[QK_K/8];
|
| 2558 |
-
} block_iq2_xxs;
|
| 2559 |
-
// 66 bytes / block for QK_K = 256, so 2.0625 bpw
|
| 2560 |
-
|
| 2561 |
-
typedef struct {
|
| 2562 |
-
half d;
|
| 2563 |
-
uint16_t qs[QK_K/8];
|
| 2564 |
-
uint8_t scales[QK_K/32];
|
| 2565 |
-
} block_iq2_xs;
|
| 2566 |
-
// 74 bytes / block for QK_K = 256, so 2.3125 bpw
|
| 2567 |
-
|
| 2568 |
-
// 2.5625 bpw quants
|
| 2569 |
-
typedef struct {
|
| 2570 |
-
half d;
|
| 2571 |
-
uint8_t qs[QK_K/4];
|
| 2572 |
-
uint8_t qh[QK_K/32];
|
| 2573 |
-
uint8_t scales[QK_K/32];
|
| 2574 |
-
} block_iq2_s;
|
| 2575 |
-
|
| 2576 |
-
typedef struct {
|
| 2577 |
-
half d;
|
| 2578 |
-
uint8_t qs[3*QK_K/8];
|
| 2579 |
-
} block_iq3_xxs;
|
| 2580 |
-
// 98 bytes / block for QK_K = 256, so 3.0625 bpw
|
| 2581 |
-
|
| 2582 |
-
// 3.4375 bpw
|
| 2583 |
-
#if QK_K == 64
|
| 2584 |
-
#define IQ3S_N_SCALE 2
|
| 2585 |
-
#else
|
| 2586 |
-
#define IQ3S_N_SCALE QK_K/64
|
| 2587 |
-
#endif
|
| 2588 |
-
typedef struct {
|
| 2589 |
-
half d;
|
| 2590 |
-
uint8_t qs[QK_K/4];
|
| 2591 |
-
uint8_t qh[QK_K/32];
|
| 2592 |
-
uint8_t signs[QK_K/8];
|
| 2593 |
-
uint8_t scales[IQ3S_N_SCALE];
|
| 2594 |
-
} block_iq3_s;
|
| 2595 |
-
|
| 2596 |
-
typedef struct {
|
| 2597 |
-
half d;
|
| 2598 |
-
uint8_t qs[QK_K/8];
|
| 2599 |
-
uint16_t qh[QK_K/32];
|
| 2600 |
-
} block_iq1_s;
|
| 2601 |
-
|
| 2602 |
-
// Non-linear quants
|
| 2603 |
-
#define QK4_NL 32
|
| 2604 |
-
typedef struct {
|
| 2605 |
-
half d;
|
| 2606 |
-
uint8_t qs[QK4_NL/2];
|
| 2607 |
-
} block_iq4_nl;
|
| 2608 |
-
|
| 2609 |
-
#if QK_K == 64
|
| 2610 |
-
#define block_iq4_xs block_iq4_nl
|
| 2611 |
-
#else
|
| 2612 |
-
typedef struct {
|
| 2613 |
-
half d;
|
| 2614 |
-
uint16_t scales_h;
|
| 2615 |
-
uint8_t scales_l[QK_K/64];
|
| 2616 |
-
uint8_t qs[QK_K/2];
|
| 2617 |
-
} block_iq4_xs;
|
| 2618 |
-
#endif
|
| 2619 |
-
|
| 2620 |
-
//====================================== dot products =========================
|
| 2621 |
-
|
| 2622 |
void kernel_mul_mv_q2_K_f32_impl(
|
| 2623 |
device const void * src0,
|
| 2624 |
device const float * src1,
|
|
|
|
| 1 |
+
#define GGML_COMMON_DECL_METAL
|
| 2 |
+
#define GGML_COMMON_IMPL_METAL
|
| 3 |
+
#include "ggml-common.h"
|
| 4 |
+
|
| 5 |
#include <metal_stdlib>
|
| 6 |
|
| 7 |
#define GGML_COMMON_IMPL_METAL
|
|
|
|
| 13 |
#define MIN(x, y) ((x) < (y) ? (x) : (y))
|
| 14 |
#define SWAP(x, y) { auto tmp = (x); (x) = (y); (y) = tmp; }
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
#define N_SIMDWIDTH 32 // assuming SIMD group size is 32
|
| 17 |
|
| 18 |
enum ggml_sort_order {
|
|
|
|
| 2447 |
}
|
| 2448 |
}
|
| 2449 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2450 |
void kernel_mul_mv_q2_K_f32_impl(
|
| 2451 |
device const void * src0,
|
| 2452 |
device const float * src1,
|
ggml-quants.c
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
#include "ggml-quants.h"
|
| 2 |
#include "ggml-impl.h"
|
| 3 |
|
|
@@ -951,7 +954,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
|
|
| 951 |
const float d = amax / ((1 << 7) - 1);
|
| 952 |
const float id = d ? 1.0f/d : 0.0f;
|
| 953 |
|
| 954 |
-
y[i].d = d;
|
| 955 |
|
| 956 |
int sum = 0;
|
| 957 |
|
|
@@ -966,7 +969,7 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict
|
|
| 966 |
sum += y[i].qs[QK8_1/2 + j];
|
| 967 |
}
|
| 968 |
|
| 969 |
-
y[i].s = sum*d;
|
| 970 |
}
|
| 971 |
}
|
| 972 |
|
|
@@ -994,7 +997,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
| 994 |
const float d = amax / ((1 << 7) - 1);
|
| 995 |
const float id = d ? 1.0f/d : 0.0f;
|
| 996 |
|
| 997 |
-
y[i].d = d;
|
| 998 |
|
| 999 |
int32x4_t accv = vdupq_n_s32(0);
|
| 1000 |
|
|
@@ -1010,7 +1013,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
| 1010 |
accv = vaddq_s32(accv, vi);
|
| 1011 |
}
|
| 1012 |
|
| 1013 |
-
y[i].s = d * vaddvq_s32(accv);
|
| 1014 |
}
|
| 1015 |
#elif defined(__wasm_simd128__)
|
| 1016 |
for (int i = 0; i < nb; i++) {
|
|
@@ -1033,7 +1036,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
| 1033 |
const float d = amax / ((1 << 7) - 1);
|
| 1034 |
const float id = d ? 1.0f/d : 0.0f;
|
| 1035 |
|
| 1036 |
-
y[i].d = d;
|
| 1037 |
|
| 1038 |
v128_t accv = wasm_i32x4_splat(0);
|
| 1039 |
|
|
@@ -1049,10 +1052,11 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
| 1049 |
accv = wasm_i32x4_add(accv, vi);
|
| 1050 |
}
|
| 1051 |
|
| 1052 |
-
y[i].s =
|
| 1053 |
-
|
| 1054 |
-
|
| 1055 |
-
|
|
|
|
| 1056 |
}
|
| 1057 |
#elif defined(__AVX2__) || defined(__AVX__)
|
| 1058 |
for (int i = 0; i < nb; i++) {
|
|
@@ -1077,7 +1081,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
| 1077 |
|
| 1078 |
// Quantize these floats
|
| 1079 |
const float d = maxScalar / 127.f;
|
| 1080 |
-
y[i].d = d;
|
| 1081 |
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
|
| 1082 |
const __m256 mul = _mm256_set1_ps( id );
|
| 1083 |
|
|
@@ -1101,7 +1105,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
| 1101 |
|
| 1102 |
#if defined(__AVX2__)
|
| 1103 |
// Compute the sum of the quants and set y[i].s
|
| 1104 |
-
y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
|
| 1105 |
|
| 1106 |
// Convert int32 to int16
|
| 1107 |
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
|
|
@@ -1131,7 +1135,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
| 1131 |
// Compute the sum of the quants and set y[i].s
|
| 1132 |
const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
|
| 1133 |
const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
|
| 1134 |
-
y[i].s = d * hsum_i32_4(_mm_add_epi32(s0, s1));
|
| 1135 |
|
| 1136 |
// Convert int32 to int16
|
| 1137 |
ni0 = _mm_packs_epi32( ni0, ni1 );
|
|
@@ -1162,7 +1166,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
| 1162 |
const float d = amax / ((1 << 7) - 1);
|
| 1163 |
const float id = d ? 1.0f/d : 0.0f;
|
| 1164 |
|
| 1165 |
-
y[i].d = d;
|
| 1166 |
|
| 1167 |
vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
|
| 1168 |
|
|
@@ -1179,7 +1183,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) {
|
|
| 1179 |
|
| 1180 |
// set y[i].s
|
| 1181 |
int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
|
| 1182 |
-
y[i].s = sum*d;
|
| 1183 |
}
|
| 1184 |
#else
|
| 1185 |
GGML_UNUSED(nb);
|
|
@@ -4019,10 +4023,10 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4019 |
const block_q8_1 * restrict b_y0 = &vy0[i];
|
| 4020 |
const block_q8_1 * restrict b_y1 = &vy1[i];
|
| 4021 |
|
| 4022 |
-
float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * b_y0->s,
|
| 4023 |
-
GGML_FP16_TO_FP32(b_x1->m) * b_y0->s,
|
| 4024 |
-
GGML_FP16_TO_FP32(b_x0->m) * b_y1->s,
|
| 4025 |
-
GGML_FP16_TO_FP32(b_x1->m) * b_y1->s};
|
| 4026 |
summs0 += summs_t;
|
| 4027 |
|
| 4028 |
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
|
@@ -4087,7 +4091,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4087 |
const block_q8_1 * restrict y0 = &y[i + 0];
|
| 4088 |
const block_q8_1 * restrict y1 = &y[i + 1];
|
| 4089 |
|
| 4090 |
-
summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s;
|
| 4091 |
|
| 4092 |
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
| 4093 |
|
|
@@ -4110,8 +4114,8 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4110 |
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
| 4111 |
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
| 4112 |
|
| 4113 |
-
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
| 4114 |
-
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
| 4115 |
}
|
| 4116 |
|
| 4117 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
|
|
@@ -4124,9 +4128,9 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4124 |
// Main loop
|
| 4125 |
for (int i = 0; i < nb; ++i) {
|
| 4126 |
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
| 4127 |
-
const float d1 = y[i].d;
|
| 4128 |
|
| 4129 |
-
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
| 4130 |
|
| 4131 |
const __m256 d0v = _mm256_set1_ps( d0 );
|
| 4132 |
const __m256 d1v = _mm256_set1_ps( d1 );
|
|
@@ -4178,7 +4182,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4178 |
|
| 4179 |
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
|
| 4180 |
|
| 4181 |
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
| 4182 |
}
|
| 4183 |
|
| 4184 |
*s = sumf;
|
|
@@ -4196,7 +4200,7 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4196 |
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
| 4197 |
}
|
| 4198 |
|
| 4199 |
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
| 4200 |
}
|
| 4201 |
|
| 4202 |
*s = sumf;
|
|
@@ -4532,8 +4536,8 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4532 |
|
| 4533 |
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
| 4534 |
|
| 4535 |
-
summs0 += GGML_FP16_TO_FP32(x0->m) * y0->s;
|
| 4536 |
-
summs1 += GGML_FP16_TO_FP32(x1->m) * y1->s;
|
| 4537 |
|
| 4538 |
// extract the 5th bit via lookup table ((b) << 4)
|
| 4539 |
memcpy(&qh0, x0->qh, sizeof(qh0));
|
|
@@ -4577,10 +4581,10 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4577 |
|
| 4578 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 4579 |
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
| 4580 |
-
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
|
| 4581 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 4582 |
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
| 4583 |
-
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
|
| 4584 |
}
|
| 4585 |
|
| 4586 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
|
|
@@ -4597,7 +4601,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4597 |
const block_q5_1 * restrict x0 = &x[i];
|
| 4598 |
const block_q8_1 * restrict y0 = &y[i];
|
| 4599 |
|
| 4600 |
-
summs += GGML_FP16_TO_FP32(x0->m) * y0->s;
|
| 4601 |
|
| 4602 |
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
| 4603 |
|
|
@@ -4644,7 +4648,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4644 |
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
| 4645 |
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
| 4646 |
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
| 4647 |
-
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * y0->d)));
|
| 4648 |
}
|
| 4649 |
|
| 4650 |
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
|
@@ -4659,14 +4663,14 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4659 |
for (int i = 0; i < nb; i++) {
|
| 4660 |
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
|
| 4661 |
|
| 4662 |
-
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
| 4663 |
|
| 4664 |
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
| 4665 |
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
| 4666 |
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
| 4667 |
qx = _mm256_or_si256(qx, bxhi);
|
| 4668 |
|
| 4669 |
-
const __m256 dy = _mm256_set1_ps(y[i].d);
|
| 4670 |
const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 4671 |
|
| 4672 |
const __m256 q = mul_sum_us8_pairs_float(qx, qy);
|
|
@@ -4686,7 +4690,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4686 |
for (int i = 0; i < nb; i++) {
|
| 4687 |
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
|
| 4688 |
|
| 4689 |
-
summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
|
| 4690 |
|
| 4691 |
__m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
|
| 4692 |
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
|
@@ -4700,7 +4704,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4700 |
bxh = _mm_or_si128(bxh, bxhih);
|
| 4701 |
bx_0 = MM256_SET_M128I(bxh, bxl);
|
| 4702 |
|
| 4703 |
-
const __m256 dy = _mm256_set1_ps(y[i].d);
|
| 4704 |
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 4705 |
|
| 4706 |
const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
|
|
@@ -4767,7 +4771,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4767 |
|
| 4768 |
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
|
| 4769 |
|
| 4770 |
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
| 4771 |
}
|
| 4772 |
|
| 4773 |
*s = sumf;
|
|
@@ -4791,7 +4795,7 @@ void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, size_t bs, const void * r
|
|
| 4791 |
sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
|
| 4792 |
}
|
| 4793 |
|
| 4794 |
-
sumf += (GGML_FP16_TO_FP32(x[i].d)*y[i].d)*sumi + GGML_FP16_TO_FP32(x[i].m)*y[i].s;
|
| 4795 |
}
|
| 4796 |
|
| 4797 |
*s = sumf;
|
|
|
|
| 1 |
+
#define GGML_COMMON_IMPL_C
|
| 2 |
+
#include "ggml-common.h"
|
| 3 |
+
|
| 4 |
#include "ggml-quants.h"
|
| 5 |
#include "ggml-impl.h"
|
| 6 |
|
|
|
|
| 954 |
const float d = amax / ((1 << 7) - 1);
|
| 955 |
const float id = d ? 1.0f/d : 0.0f;
|
| 956 |
|
| 957 |
+
y[i].d = GGML_FP32_TO_FP16(d);
|
| 958 |
|
| 959 |
int sum = 0;
|
| 960 |
|
|
|
|
| 969 |
sum += y[i].qs[QK8_1/2 + j];
|
| 970 |
}
|
| 971 |
|
| 972 |
+
y[i].s = GGML_FP32_TO_FP16(sum*d);
|
| 973 |
}
|
| 974 |
}
|
| 975 |
|
|
|
|
| 997 |
const float d = amax / ((1 << 7) - 1);
|
| 998 |
const float id = d ? 1.0f/d : 0.0f;
|
| 999 |
|
| 1000 |
+
y[i].d = GGML_FP32_TO_FP16(d);
|
| 1001 |
|
| 1002 |
int32x4_t accv = vdupq_n_s32(0);
|
| 1003 |
|
|
|
|
| 1013 |
accv = vaddq_s32(accv, vi);
|
| 1014 |
}
|
| 1015 |
|
| 1016 |
+
y[i].s = GGML_FP32_TO_FP16(d * vaddvq_s32(accv));
|
| 1017 |
}
|
| 1018 |
#elif defined(__wasm_simd128__)
|
| 1019 |
for (int i = 0; i < nb; i++) {
|
|
|
|
| 1036 |
const float d = amax / ((1 << 7) - 1);
|
| 1037 |
const float id = d ? 1.0f/d : 0.0f;
|
| 1038 |
|
| 1039 |
+
y[i].d = GGML_FP32_TO_FP16(d);
|
| 1040 |
|
| 1041 |
v128_t accv = wasm_i32x4_splat(0);
|
| 1042 |
|
|
|
|
| 1052 |
accv = wasm_i32x4_add(accv, vi);
|
| 1053 |
}
|
| 1054 |
|
| 1055 |
+
y[i].s = GGML_FP32_TO_FP16(
|
| 1056 |
+
d * (wasm_i32x4_extract_lane(accv, 0) +
|
| 1057 |
+
wasm_i32x4_extract_lane(accv, 1) +
|
| 1058 |
+
wasm_i32x4_extract_lane(accv, 2) +
|
| 1059 |
+
wasm_i32x4_extract_lane(accv, 3)));
|
| 1060 |
}
|
| 1061 |
#elif defined(__AVX2__) || defined(__AVX__)
|
| 1062 |
for (int i = 0; i < nb; i++) {
|
|
|
|
| 1081 |
|
| 1082 |
// Quantize these floats
|
| 1083 |
const float d = maxScalar / 127.f;
|
| 1084 |
+
y[i].d = GGML_FP32_TO_FP16(d);
|
| 1085 |
const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
|
| 1086 |
const __m256 mul = _mm256_set1_ps( id );
|
| 1087 |
|
|
|
|
| 1105 |
|
| 1106 |
#if defined(__AVX2__)
|
| 1107 |
// Compute the sum of the quants and set y[i].s
|
| 1108 |
+
y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3))));
|
| 1109 |
|
| 1110 |
// Convert int32 to int16
|
| 1111 |
i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
|
|
|
|
| 1135 |
// Compute the sum of the quants and set y[i].s
|
| 1136 |
const __m128i s0 = _mm_add_epi32(_mm_add_epi32(ni0, ni1), _mm_add_epi32(ni2, ni3));
|
| 1137 |
const __m128i s1 = _mm_add_epi32(_mm_add_epi32(ni4, ni5), _mm_add_epi32(ni6, ni7));
|
| 1138 |
+
y[i].s = GGML_FP32_TO_FP16(d * hsum_i32_4(_mm_add_epi32(s0, s1)));
|
| 1139 |
|
| 1140 |
// Convert int32 to int16
|
| 1141 |
ni0 = _mm_packs_epi32( ni0, ni1 );
|
|
|
|
| 1166 |
const float d = amax / ((1 << 7) - 1);
|
| 1167 |
const float id = d ? 1.0f/d : 0.0f;
|
| 1168 |
|
| 1169 |
+
y[i].d = GGML_FP32_TO_FP16(d);
|
| 1170 |
|
| 1171 |
vfloat32m4_t x0 = __riscv_vfmul_vf_f32m4(v_x, id, vl);
|
| 1172 |
|
|
|
|
| 1183 |
|
| 1184 |
// set y[i].s
|
| 1185 |
int sum = __riscv_vmv_x_s_i16m1_i16(vwrs);
|
| 1186 |
+
y[i].s = GGML_FP32_TO_FP16(sum*d);
|
| 1187 |
}
|
| 1188 |
#else
|
| 1189 |
GGML_UNUSED(nb);
|
|
|
|
| 4023 |
const block_q8_1 * restrict b_y0 = &vy0[i];
|
| 4024 |
const block_q8_1 * restrict b_y1 = &vy1[i];
|
| 4025 |
|
| 4026 |
+
float32x4_t summs_t = {GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y0->s),
|
| 4027 |
+
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y0->s),
|
| 4028 |
+
GGML_FP16_TO_FP32(b_x0->m) * GGML_FP16_TO_FP32(b_y1->s),
|
| 4029 |
+
GGML_FP16_TO_FP32(b_x1->m) * GGML_FP16_TO_FP32(b_y1->s)};
|
| 4030 |
summs0 += summs_t;
|
| 4031 |
|
| 4032 |
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
|
|
|
| 4091 |
const block_q8_1 * restrict y0 = &y[i + 0];
|
| 4092 |
const block_q8_1 * restrict y1 = &y[i + 1];
|
| 4093 |
|
| 4094 |
+
summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s) + GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
|
| 4095 |
|
| 4096 |
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
| 4097 |
|
|
|
|
| 4114 |
const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
|
| 4115 |
const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
|
| 4116 |
|
| 4117 |
+
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 4118 |
+
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 4119 |
}
|
| 4120 |
|
| 4121 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
|
|
|
|
| 4128 |
// Main loop
|
| 4129 |
for (int i = 0; i < nb; ++i) {
|
| 4130 |
const float d0 = GGML_FP16_TO_FP32(x[i].d);
|
| 4131 |
+
const float d1 = GGML_FP16_TO_FP32(y[i].d);
|
| 4132 |
|
| 4133 |
+
summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
|
| 4134 |
|
| 4135 |
const __m256 d0v = _mm256_set1_ps( d0 );
|
| 4136 |
const __m256 d1v = _mm256_set1_ps( d1 );
|
|
|
|
| 4182 |
|
| 4183 |
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
|
| 4184 |
|
| 4185 |
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
|
| 4186 |
}
|
| 4187 |
|
| 4188 |
*s = sumf;
|
|
|
|
| 4200 |
sumi += (v0 * y[i].qs[j]) + (v1 * y[i].qs[j + qk/2]);
|
| 4201 |
}
|
| 4202 |
|
| 4203 |
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
|
| 4204 |
}
|
| 4205 |
|
| 4206 |
*s = sumf;
|
|
|
|
| 4536 |
|
| 4537 |
const uint8x16_t m4b = vdupq_n_u8(0x0F);
|
| 4538 |
|
| 4539 |
+
summs0 += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
|
| 4540 |
+
summs1 += GGML_FP16_TO_FP32(x1->m) * GGML_FP16_TO_FP32(y1->s);
|
| 4541 |
|
| 4542 |
// extract the 5th bit via lookup table ((b) << 4)
|
| 4543 |
memcpy(&qh0, x0->qh, sizeof(qh0));
|
|
|
|
| 4581 |
|
| 4582 |
sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
|
| 4583 |
ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
|
| 4584 |
+
ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
|
| 4585 |
sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
|
| 4586 |
ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
|
| 4587 |
+
ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
|
| 4588 |
}
|
| 4589 |
|
| 4590 |
*s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
|
|
|
|
| 4601 |
const block_q5_1 * restrict x0 = &x[i];
|
| 4602 |
const block_q8_1 * restrict y0 = &y[i];
|
| 4603 |
|
| 4604 |
+
summs += GGML_FP16_TO_FP32(x0->m) * GGML_FP16_TO_FP32(y0->s);
|
| 4605 |
|
| 4606 |
const v128_t m4b = wasm_i8x16_splat(0x0F);
|
| 4607 |
|
|
|
|
| 4648 |
wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
|
| 4649 |
wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
|
| 4650 |
wasm_i32x4_dot_i16x8(v0hfh, v1hh)))),
|
| 4651 |
+
wasm_f32x4_splat(GGML_FP16_TO_FP32(x0->d) * GGML_FP16_TO_FP32(y0->d))));
|
| 4652 |
}
|
| 4653 |
|
| 4654 |
*s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
|
|
|
|
| 4663 |
for (int i = 0; i < nb; i++) {
|
| 4664 |
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
|
| 4665 |
|
| 4666 |
+
summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
|
| 4667 |
|
| 4668 |
__m256i qx = bytes_from_nibbles_32(x[i].qs);
|
| 4669 |
__m256i bxhi = bytes_from_bits_32(x[i].qh);
|
| 4670 |
bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
|
| 4671 |
qx = _mm256_or_si256(qx, bxhi);
|
| 4672 |
|
| 4673 |
+
const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].d));
|
| 4674 |
const __m256i qy = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 4675 |
|
| 4676 |
const __m256 q = mul_sum_us8_pairs_float(qx, qy);
|
|
|
|
| 4690 |
for (int i = 0; i < nb; i++) {
|
| 4691 |
const __m256 dx = _mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d));
|
| 4692 |
|
| 4693 |
+
summs += GGML_FP16_TO_FP32(x[i].m) * GGML_FP16_TO_FP32(y[i].s);
|
| 4694 |
|
| 4695 |
__m256i bx_0 = bytes_from_nibbles_32(x[i].qs);
|
| 4696 |
const __m256i bxhi = bytes_from_bits_32(x[i].qh);
|
|
|
|
| 4704 |
bxh = _mm_or_si128(bxh, bxhih);
|
| 4705 |
bx_0 = MM256_SET_M128I(bxh, bxl);
|
| 4706 |
|
| 4707 |
+
const __m256 dy = _mm256_set1_ps(GGML_FP16_TO_FP32(y[i].d));
|
| 4708 |
const __m256i by_0 = _mm256_loadu_si256((const __m256i *)y[i].qs);
|
| 4709 |
|
| 4710 |
const __m256 q = mul_sum_us8_pairs_float(bx_0, by_0);
|
|
|
|
| 4771 |
|
| 4772 |
int sumi = __riscv_vmv_x_s_i32m1_i32(vs2);
|
| 4773 |
|
| 4774 |
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
|
| 4775 |
}
|
| 4776 |
|
| 4777 |
*s = sumf;
|
|
|
|
| 4795 |
sumi += (x0 * y[i].qs[j]) + (x1 * y[i].qs[j + qk/2]);
|
| 4796 |
}
|
| 4797 |
|
| 4798 |
+
sumf += (GGML_FP16_TO_FP32(x[i].d)*GGML_FP16_TO_FP32(y[i].d))*sumi + GGML_FP16_TO_FP32(x[i].m)*GGML_FP16_TO_FP32(y[i].s);
|
| 4799 |
}
|
| 4800 |
|
| 4801 |
*s = sumf;
|
ggml-quants.h
CHANGED
|
@@ -1,247 +1,11 @@
|
|
| 1 |
#pragma once
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
#include "ggml-impl.h"
|
| 6 |
-
|
| 7 |
-
#include <stdint.h>
|
| 8 |
-
#include <stddef.h>
|
| 9 |
-
|
| 10 |
-
#define QK4_0 32
|
| 11 |
-
typedef struct {
|
| 12 |
-
ggml_fp16_t d; // delta
|
| 13 |
-
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
| 14 |
-
} block_q4_0;
|
| 15 |
-
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
| 16 |
-
|
| 17 |
-
#define QK4_1 32
|
| 18 |
-
typedef struct {
|
| 19 |
-
ggml_fp16_t d; // delta
|
| 20 |
-
ggml_fp16_t m; // min
|
| 21 |
-
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
| 22 |
-
} block_q4_1;
|
| 23 |
-
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_fp16_t) + QK4_1 / 2, "wrong q4_1 block size/padding");
|
| 24 |
-
|
| 25 |
-
#define QK5_0 32
|
| 26 |
-
typedef struct {
|
| 27 |
-
ggml_fp16_t d; // delta
|
| 28 |
-
uint8_t qh[4]; // 5-th bit of quants
|
| 29 |
-
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
| 30 |
-
} block_q5_0;
|
| 31 |
-
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
|
| 32 |
-
|
| 33 |
-
#define QK5_1 32
|
| 34 |
-
typedef struct {
|
| 35 |
-
ggml_fp16_t d; // delta
|
| 36 |
-
ggml_fp16_t m; // min
|
| 37 |
-
uint8_t qh[4]; // 5-th bit of quants
|
| 38 |
-
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
| 39 |
-
} block_q5_1;
|
| 40 |
-
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
|
| 41 |
-
|
| 42 |
-
#define QK8_0 32
|
| 43 |
-
typedef struct {
|
| 44 |
-
ggml_fp16_t d; // delta
|
| 45 |
-
int8_t qs[QK8_0]; // quants
|
| 46 |
-
} block_q8_0;
|
| 47 |
-
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
| 48 |
-
|
| 49 |
-
#define QK8_1 32
|
| 50 |
-
typedef struct {
|
| 51 |
-
float d; // delta
|
| 52 |
-
float s; // d * sum(qs[i])
|
| 53 |
-
int8_t qs[QK8_1]; // quants
|
| 54 |
-
} block_q8_1;
|
| 55 |
-
static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block size/padding");
|
| 56 |
-
|
| 57 |
-
//
|
| 58 |
-
// Super-block quantization structures
|
| 59 |
-
//
|
| 60 |
-
|
| 61 |
-
// Super-block size
|
| 62 |
-
#ifdef GGML_QKK_64
|
| 63 |
-
#define QK_K 64
|
| 64 |
-
#define K_SCALE_SIZE 4
|
| 65 |
-
#else
|
| 66 |
-
#define QK_K 256
|
| 67 |
-
#define K_SCALE_SIZE 12
|
| 68 |
-
#endif
|
| 69 |
-
|
| 70 |
-
// 2-bit quantization
|
| 71 |
-
// weight is represented as x = a * q + b
|
| 72 |
-
// 16 blocks of 16 elements each
|
| 73 |
-
// Effectively 2.625 bits per weight
|
| 74 |
-
typedef struct {
|
| 75 |
-
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
| 76 |
-
uint8_t qs[QK_K/4]; // quants
|
| 77 |
-
ggml_fp16_t d; // super-block scale for quantized scales
|
| 78 |
-
ggml_fp16_t dmin; // super-block scale for quantized mins
|
| 79 |
-
} block_q2_K;
|
| 80 |
-
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
| 81 |
-
|
| 82 |
-
// 3-bit quantization
|
| 83 |
-
// weight is represented as x = a * q
|
| 84 |
-
// 16 blocks of 16 elements each
|
| 85 |
-
// Effectively 3.4375 bits per weight
|
| 86 |
-
#ifdef GGML_QKK_64
|
| 87 |
-
typedef struct {
|
| 88 |
-
uint8_t hmask[QK_K/8]; // quants - high bit
|
| 89 |
-
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
| 90 |
-
uint8_t scales[2];
|
| 91 |
-
ggml_fp16_t d; // super-block scale
|
| 92 |
-
} block_q3_K;
|
| 93 |
-
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 2, "wrong q3_K block size/padding");
|
| 94 |
-
#else
|
| 95 |
-
typedef struct {
|
| 96 |
-
uint8_t hmask[QK_K/8]; // quants - high bit
|
| 97 |
-
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
| 98 |
-
uint8_t scales[12]; // scales, quantized with 6 bits
|
| 99 |
-
ggml_fp16_t d; // super-block scale
|
| 100 |
-
} block_q3_K;
|
| 101 |
-
static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + 12, "wrong q3_K block size/padding");
|
| 102 |
-
#endif
|
| 103 |
|
| 104 |
-
|
| 105 |
-
// 8 blocks of 32 elements each
|
| 106 |
-
// weight is represented as x = a * q + b
|
| 107 |
-
// Effectively 4.5 bits per weight
|
| 108 |
-
#ifdef GGML_QKK_64
|
| 109 |
-
typedef struct {
|
| 110 |
-
ggml_fp16_t d[2]; // super-block scales/mins
|
| 111 |
-
uint8_t scales[2]; // 4-bit block scales/mins
|
| 112 |
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
| 113 |
-
} block_q4_K;
|
| 114 |
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
| 115 |
-
#else
|
| 116 |
-
typedef struct {
|
| 117 |
-
ggml_fp16_t d; // super-block scale for quantized scales
|
| 118 |
-
ggml_fp16_t dmin; // super-block scale for quantized mins
|
| 119 |
-
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
| 120 |
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
| 121 |
-
} block_q4_K;
|
| 122 |
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2, "wrong q4_K block size/padding");
|
| 123 |
-
#endif
|
| 124 |
-
|
| 125 |
-
// 5-bit quantization
|
| 126 |
-
// 8 blocks of 32 elements each
|
| 127 |
-
// weight is represented as x = a * q + b
|
| 128 |
-
// Effectively 5.5 bits per weight
|
| 129 |
-
#ifdef GGML_QKK_64
|
| 130 |
-
typedef struct {
|
| 131 |
-
ggml_fp16_t d; // super-block scale
|
| 132 |
-
int8_t scales[QK_K/16]; // 8-bit block scales
|
| 133 |
-
uint8_t qh[QK_K/8]; // quants, high bit
|
| 134 |
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
| 135 |
-
} block_q5_K;
|
| 136 |
-
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
| 137 |
-
#else
|
| 138 |
-
typedef struct {
|
| 139 |
-
ggml_fp16_t d; // super-block scale for quantized scales
|
| 140 |
-
ggml_fp16_t dmin; // super-block scale for quantized mins
|
| 141 |
-
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
| 142 |
-
uint8_t qh[QK_K/8]; // quants, high bit
|
| 143 |
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
| 144 |
-
} block_q5_K;
|
| 145 |
-
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
| 146 |
-
#endif
|
| 147 |
-
|
| 148 |
-
// 6-bit quantization
|
| 149 |
-
// weight is represented as x = a * q
|
| 150 |
-
// 16 blocks of 16 elements each
|
| 151 |
-
// Effectively 6.5625 bits per weight
|
| 152 |
-
typedef struct {
|
| 153 |
-
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
| 154 |
-
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
| 155 |
-
int8_t scales[QK_K/16]; // scales, quantized with 8 bits
|
| 156 |
-
ggml_fp16_t d; // super-block scale
|
| 157 |
-
} block_q6_K;
|
| 158 |
-
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + QK_K / 16 + 3*QK_K/4, "wrong q6_K block size/padding");
|
| 159 |
-
|
| 160 |
-
// This is only used for intermediate quantization and dot products
|
| 161 |
-
typedef struct {
|
| 162 |
-
float d; // delta
|
| 163 |
-
int8_t qs[QK_K]; // quants
|
| 164 |
-
int16_t bsums[QK_K/16]; // sum of quants in groups of 16
|
| 165 |
-
} block_q8_K;
|
| 166 |
-
static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_t), "wrong q8_K block size/padding");
|
| 167 |
-
|
| 168 |
-
// (Almost) "true" 2-bit quantization.
|
| 169 |
-
// Due to the need to use blocks as per ggml design, it ends up using
|
| 170 |
-
// 2.0625 bpw because of the 16-bit scale for each block of 256.
|
| 171 |
-
typedef struct {
|
| 172 |
-
ggml_fp16_t d;
|
| 173 |
-
uint16_t qs[QK_K/8];
|
| 174 |
-
} block_iq2_xxs;
|
| 175 |
-
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
|
| 176 |
-
|
| 177 |
-
// 2.3125 bpw quants
|
| 178 |
-
typedef struct {
|
| 179 |
-
ggml_fp16_t d;
|
| 180 |
-
uint16_t qs[QK_K/8];
|
| 181 |
-
uint8_t scales[QK_K/32];
|
| 182 |
-
} block_iq2_xs;
|
| 183 |
-
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
| 184 |
-
|
| 185 |
-
// 2.5625 bpw quants
|
| 186 |
-
typedef struct {
|
| 187 |
-
ggml_fp16_t d;
|
| 188 |
-
uint8_t qs[QK_K/4];
|
| 189 |
-
uint8_t qh[QK_K/32];
|
| 190 |
-
uint8_t scales[QK_K/32];
|
| 191 |
-
} block_iq2_s;
|
| 192 |
-
static_assert(sizeof(block_iq2_s) == sizeof(ggml_fp16_t) + QK_K/4 + QK_K/16, "wrong iq2_s block size/padding");
|
| 193 |
|
| 194 |
-
//
|
| 195 |
-
// Due to the need to use blocks as per ggml design, it ends up using
|
| 196 |
-
// 3.0625 bpw because of the 16-bit scale for each block of 256.
|
| 197 |
-
typedef struct {
|
| 198 |
-
ggml_fp16_t d;
|
| 199 |
-
uint8_t qs[3*QK_K/8];
|
| 200 |
-
} block_iq3_xxs;
|
| 201 |
-
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
| 202 |
-
|
| 203 |
-
// 3.4375 bpw
|
| 204 |
-
#if QK_K == 64
|
| 205 |
-
#define IQ3S_N_SCALE 2
|
| 206 |
-
#else
|
| 207 |
-
#define IQ3S_N_SCALE QK_K/64
|
| 208 |
-
#endif
|
| 209 |
-
typedef struct {
|
| 210 |
-
ggml_fp16_t d;
|
| 211 |
-
uint8_t qs[QK_K/4];
|
| 212 |
-
uint8_t qh[QK_K/32];
|
| 213 |
-
uint8_t signs[QK_K/8];
|
| 214 |
-
uint8_t scales[IQ3S_N_SCALE];
|
| 215 |
-
} block_iq3_s;
|
| 216 |
-
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
|
| 217 |
-
|
| 218 |
-
typedef struct {
|
| 219 |
-
ggml_fp16_t d;
|
| 220 |
-
uint8_t qs[QK_K/8];
|
| 221 |
-
uint16_t qh[QK_K/32];
|
| 222 |
-
} block_iq1_s;
|
| 223 |
-
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
| 224 |
-
|
| 225 |
-
// Non-linear quants
|
| 226 |
-
#define QK4_NL 32
|
| 227 |
-
typedef struct {
|
| 228 |
-
ggml_fp16_t d;
|
| 229 |
-
uint8_t qs[QK4_NL/2];
|
| 230 |
-
} block_iq4_nl;
|
| 231 |
-
static_assert(sizeof(block_iq4_nl) == sizeof(ggml_fp16_t) + QK4_NL/2, "wrong iq4_nl block size/padding");
|
| 232 |
-
|
| 233 |
-
#if QK_K == 64
|
| 234 |
-
#define block_iq4_xs block_iq4_nl
|
| 235 |
-
//typedef struct block_iq4_nl block_iq4_xs;
|
| 236 |
-
#else
|
| 237 |
-
typedef struct {
|
| 238 |
-
ggml_fp16_t d;
|
| 239 |
-
uint16_t scales_h;
|
| 240 |
-
uint8_t scales_l[QK_K/64];
|
| 241 |
-
uint8_t qs[QK_K/2];
|
| 242 |
-
} block_iq4_xs;
|
| 243 |
-
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_fp16_t) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
|
| 244 |
-
#endif
|
| 245 |
|
| 246 |
#ifdef __cplusplus
|
| 247 |
extern "C" {
|
|
|
|
| 1 |
#pragma once
|
| 2 |
|
| 3 |
+
#define GGML_COMMON_DECL_C
|
| 4 |
+
#include "ggml-common.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
#include "ggml.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
+
// GGML internal header
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
#ifdef __cplusplus
|
| 11 |
extern "C" {
|
ggml-sycl.cpp
CHANGED
|
@@ -3144,6 +3144,7 @@ namespace dpct
|
|
| 3144 |
|
| 3145 |
} // COPY from DPCT head files
|
| 3146 |
|
|
|
|
| 3147 |
#define GGML_COMMON_IMPL_SYCL
|
| 3148 |
#include "ggml-common.h"
|
| 3149 |
|
|
@@ -3312,66 +3313,6 @@ typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0,
|
|
| 3312 |
const float *src1_dd, float *dst_dd,
|
| 3313 |
const dpct::queue_ptr &main_stream);
|
| 3314 |
|
| 3315 |
-
// QK = number of values after dequantization
|
| 3316 |
-
// QR = QK / number of values before dequantization
|
| 3317 |
-
// QI = number of 32 bit integers before dequantization
|
| 3318 |
-
|
| 3319 |
-
#define QK4_0 32
|
| 3320 |
-
#define QR4_0 2
|
| 3321 |
-
#define QI4_0 (QK4_0 / (4 * QR4_0))
|
| 3322 |
-
typedef struct dpct_type_block_q4_0 {
|
| 3323 |
-
sycl::half d; // delta
|
| 3324 |
-
uint8_t qs[QK4_0 / 2]; // nibbles / quants
|
| 3325 |
-
} block_q4_0;
|
| 3326 |
-
static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
|
| 3327 |
-
|
| 3328 |
-
#define QK4_1 32
|
| 3329 |
-
#define QR4_1 2
|
| 3330 |
-
#define QI4_1 (QK4_1 / (4 * QR4_1))
|
| 3331 |
-
typedef struct dpct_type_block_q4_1 {
|
| 3332 |
-
sycl::half2 dm; // dm.x = delta, dm.y = min
|
| 3333 |
-
uint8_t qs[QK4_1 / 2]; // nibbles / quants
|
| 3334 |
-
} block_q4_1;
|
| 3335 |
-
static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
|
| 3336 |
-
|
| 3337 |
-
#define QK5_0 32
|
| 3338 |
-
#define QR5_0 2
|
| 3339 |
-
#define QI5_0 (QK5_0 / (4 * QR5_0))
|
| 3340 |
-
typedef struct dpct_type_block_q5_0 {
|
| 3341 |
-
sycl::half d; // delta
|
| 3342 |
-
uint8_t qh[4]; // 5-th bit of quants
|
| 3343 |
-
uint8_t qs[QK5_0 / 2]; // nibbles / quants
|
| 3344 |
-
} block_q5_0;
|
| 3345 |
-
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
|
| 3346 |
-
|
| 3347 |
-
#define QK5_1 32
|
| 3348 |
-
#define QR5_1 2
|
| 3349 |
-
#define QI5_1 (QK5_1 / (4 * QR5_1))
|
| 3350 |
-
typedef struct dpct_type_block_q5_1 {
|
| 3351 |
-
sycl::half2 dm; // dm.x = delta, dm.y = min
|
| 3352 |
-
uint8_t qh[4]; // 5-th bit of quants
|
| 3353 |
-
uint8_t qs[QK5_1 / 2]; // nibbles / quants
|
| 3354 |
-
} block_q5_1;
|
| 3355 |
-
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
|
| 3356 |
-
|
| 3357 |
-
#define QK8_0 32
|
| 3358 |
-
#define QR8_0 1
|
| 3359 |
-
#define QI8_0 (QK8_0 / (4 * QR8_0))
|
| 3360 |
-
typedef struct dpct_type_block_q8_0 {
|
| 3361 |
-
sycl::half d; // delta
|
| 3362 |
-
int8_t qs[QK8_0]; // quants
|
| 3363 |
-
} block_q8_0;
|
| 3364 |
-
static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
|
| 3365 |
-
|
| 3366 |
-
#define QK8_1 32
|
| 3367 |
-
#define QR8_1 1
|
| 3368 |
-
#define QI8_1 (QK8_1 / (4 * QR8_1))
|
| 3369 |
-
typedef struct dpct_type_block_q8_1 {
|
| 3370 |
-
sycl::half2 ds; // ds.x = delta, ds.y = sum
|
| 3371 |
-
int8_t qs[QK8_0]; // quants
|
| 3372 |
-
} block_q8_1;
|
| 3373 |
-
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
|
| 3374 |
-
|
| 3375 |
typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
|
| 3376 |
typedef void (*allocate_tiles_sycl_t)(int **x_ql, sycl::half2 **x_dm,
|
| 3377 |
int **x_qh, int **x_sc);
|
|
@@ -3388,137 +3329,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
|
|
| 3388 |
const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms,
|
| 3389 |
const int &i, const int &j, const int &k);
|
| 3390 |
|
| 3391 |
-
//================================= k-quants
|
| 3392 |
-
|
| 3393 |
-
#ifdef GGML_QKK_64
|
| 3394 |
-
#define QK_K 64
|
| 3395 |
-
#define K_SCALE_SIZE 4
|
| 3396 |
-
#else
|
| 3397 |
-
#define QK_K 256
|
| 3398 |
-
#define K_SCALE_SIZE 12
|
| 3399 |
-
#endif
|
| 3400 |
-
|
| 3401 |
-
#define QR2_K 4
|
| 3402 |
-
#define QI2_K (QK_K / (4*QR2_K))
|
| 3403 |
-
typedef struct dpct_type_block_q2_K {
|
| 3404 |
-
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
| 3405 |
-
uint8_t qs[QK_K/4]; // quants
|
| 3406 |
-
sycl::half2 dm; // super-block scale for quantized scales/mins
|
| 3407 |
-
} block_q2_K;
|
| 3408 |
-
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
|
| 3409 |
-
|
| 3410 |
-
#define QR3_K 4
|
| 3411 |
-
#define QI3_K (QK_K / (4*QR3_K))
|
| 3412 |
-
typedef struct dpct_type_block_q3_K {
|
| 3413 |
-
uint8_t hmask[QK_K/8]; // quants - high bit
|
| 3414 |
-
uint8_t qs[QK_K/4]; // quants - low 2 bits
|
| 3415 |
-
#ifdef GGML_QKK_64
|
| 3416 |
-
uint8_t scales[2]; // scales, quantized with 8 bits
|
| 3417 |
-
#else
|
| 3418 |
-
uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
|
| 3419 |
-
#endif
|
| 3420 |
-
sycl::half d; // super-block scale
|
| 3421 |
-
} block_q3_K;
|
| 3422 |
-
//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
|
| 3423 |
-
|
| 3424 |
-
#define QR4_K 2
|
| 3425 |
-
#define QI4_K (QK_K / (4*QR4_K))
|
| 3426 |
-
#ifdef GGML_QKK_64
|
| 3427 |
-
typedef struct {
|
| 3428 |
-
sycl::half dm[2]; // super-block scales/mins
|
| 3429 |
-
uint8_t scales[2]; // 4-bit block scales/mins
|
| 3430 |
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
| 3431 |
-
} block_q4_K;
|
| 3432 |
-
static_assert(sizeof(block_q4_K) == sizeof(sycl::half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
|
| 3433 |
-
#else
|
| 3434 |
-
typedef struct dpct_type_block_q4_K {
|
| 3435 |
-
sycl::half2 dm; // super-block scale for quantized scales/mins
|
| 3436 |
-
uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
|
| 3437 |
-
uint8_t qs[QK_K/2]; // 4--bit quants
|
| 3438 |
-
} block_q4_K;
|
| 3439 |
-
static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
|
| 3440 |
-
#endif
|
| 3441 |
-
|
| 3442 |
-
#define QR5_K 2
|
| 3443 |
-
#define QI5_K (QK_K / (4*QR5_K))
|
| 3444 |
-
#ifdef GGML_QKK_64
|
| 3445 |
-
typedef struct {
|
| 3446 |
-
sycl::half d; // super-block scale
|
| 3447 |
-
int8_t scales[QK_K/16]; // block scales
|
| 3448 |
-
uint8_t qh[QK_K/8]; // quants, high bit
|
| 3449 |
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
| 3450 |
-
} block_q5_K;
|
| 3451 |
-
static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
|
| 3452 |
-
#else
|
| 3453 |
-
typedef struct dpct_type_block_q5_K {
|
| 3454 |
-
sycl::half2 dm; // super-block scale for quantized scales/mins
|
| 3455 |
-
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
|
| 3456 |
-
uint8_t qh[QK_K/8]; // quants, high bit
|
| 3457 |
-
uint8_t qs[QK_K/2]; // quants, low 4 bits
|
| 3458 |
-
} block_q5_K;
|
| 3459 |
-
static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
|
| 3460 |
-
#endif
|
| 3461 |
-
|
| 3462 |
-
#define QR6_K 2
|
| 3463 |
-
#define QI6_K (QK_K / (4*QR6_K))
|
| 3464 |
-
typedef struct dpct_type_block_q6_K {
|
| 3465 |
-
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
| 3466 |
-
uint8_t qh[QK_K/4]; // quants, upper 2 bits
|
| 3467 |
-
int8_t scales[QK_K/16]; // scales
|
| 3468 |
-
sycl::half d; // delta
|
| 3469 |
-
} block_q6_K;
|
| 3470 |
-
static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
|
| 3471 |
-
|
| 3472 |
-
#define QR2_XXS 8
|
| 3473 |
-
#define QI2_XXS (QK_K / (4*QR2_XXS))
|
| 3474 |
-
typedef struct dpct_type_block_iq2_xxs {
|
| 3475 |
-
sycl::half d;
|
| 3476 |
-
uint16_t qs[QK_K/8];
|
| 3477 |
-
} block_iq2_xxs;
|
| 3478 |
-
static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding");
|
| 3479 |
-
|
| 3480 |
-
#define QR2_XS 8
|
| 3481 |
-
#define QI2_XS (QK_K / (4*QR2_XS))
|
| 3482 |
-
typedef struct dpct_type_block_iq2_xs {
|
| 3483 |
-
sycl::half d;
|
| 3484 |
-
uint16_t qs[QK_K/8];
|
| 3485 |
-
uint8_t scales[QK_K/32];
|
| 3486 |
-
} block_iq2_xs;
|
| 3487 |
-
static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding");
|
| 3488 |
-
|
| 3489 |
-
#define QR3_XXS 8
|
| 3490 |
-
#define QI3_XXS (QK_K / (4*QR3_XXS))
|
| 3491 |
-
typedef struct dpct_type_block_iq3_xxs {
|
| 3492 |
-
sycl::half d;
|
| 3493 |
-
uint8_t qs[3*(QK_K/8)];
|
| 3494 |
-
} block_iq3_xxs;
|
| 3495 |
-
static_assert(sizeof(block_iq3_xxs) == sizeof(ggml_fp16_t) + 3*(QK_K/8), "wrong iq3_xxs block size/padding");
|
| 3496 |
-
|
| 3497 |
-
#define QR3_XS 8
|
| 3498 |
-
#define QI3_XS (QK_K / (4*QR3_XS))
|
| 3499 |
-
#if QK_K == 64
|
| 3500 |
-
#define IQ3S_N_SCALE 2
|
| 3501 |
-
#else
|
| 3502 |
-
#define IQ3S_N_SCALE QK_K/64
|
| 3503 |
-
#endif
|
| 3504 |
-
typedef struct {
|
| 3505 |
-
sycl::half d;
|
| 3506 |
-
uint8_t qs[QK_K/4];
|
| 3507 |
-
uint8_t qh[QK_K/32];
|
| 3508 |
-
uint8_t signs[QK_K/8];
|
| 3509 |
-
uint8_t scales[IQ3S_N_SCALE];
|
| 3510 |
-
} block_iq3_s;
|
| 3511 |
-
static_assert(sizeof(block_iq3_s) == sizeof(ggml_fp16_t) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
|
| 3512 |
-
|
| 3513 |
-
#define QR1_S 8
|
| 3514 |
-
#define QI1_S (QK_K / (4*QR1_S))
|
| 3515 |
-
typedef struct {
|
| 3516 |
-
sycl::half d;
|
| 3517 |
-
uint8_t qs[QK_K/8];
|
| 3518 |
-
uint16_t qh[QK_K/32];
|
| 3519 |
-
} block_iq1_s;
|
| 3520 |
-
static_assert(sizeof(block_iq1_s) == sizeof(ggml_fp16_t) + QK_K/8 + QK_K/16, "wrong iq1_s block size/padding");
|
| 3521 |
-
|
| 3522 |
#define WARP_SIZE 32
|
| 3523 |
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
| 3524 |
|
|
|
|
| 3144 |
|
| 3145 |
} // COPY from DPCT head files
|
| 3146 |
|
| 3147 |
+
#define GGML_COMMON_DECL_SYCL
|
| 3148 |
#define GGML_COMMON_IMPL_SYCL
|
| 3149 |
#include "ggml-common.h"
|
| 3150 |
|
|
|
|
| 3313 |
const float *src1_dd, float *dst_dd,
|
| 3314 |
const dpct::queue_ptr &main_stream);
|
| 3315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3316 |
typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
|
| 3317 |
typedef void (*allocate_tiles_sycl_t)(int **x_ql, sycl::half2 **x_dm,
|
| 3318 |
int **x_qh, int **x_sc);
|
|
|
|
| 3329 |
const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms,
|
| 3330 |
const int &i, const int &j, const int &k);
|
| 3331 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3332 |
#define WARP_SIZE 32
|
| 3333 |
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
|
| 3334 |
|