Spaces:
Running
Running
2-bit quantizations (llama/4897)
Browse files
* imatrix: load
* imatrix: WIP
* imatrix: Add Q2_K quantization
* imatrix: also guard against Q2_K_S quantization without importance matrix
* imatrix: guard even more against low-bit quantization misuse
---------
Co-authored-by: Iwan Kawrakow <[email protected]>
- ggml-quants.c +900 -50
- ggml-quants.h +8 -4
- ggml.c +25 -11
- ggml.h +6 -3
ggml-quants.c
CHANGED
|
@@ -5,6 +5,8 @@
|
|
| 5 |
#include <string.h>
|
| 6 |
#include <assert.h>
|
| 7 |
#include <float.h>
|
|
|
|
|
|
|
| 8 |
|
| 9 |
#ifdef __ARM_NEON
|
| 10 |
|
|
@@ -1639,6 +1641,241 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n
|
|
| 1639 |
return (n/QK_K*sizeof(block_q2_K));
|
| 1640 |
}
|
| 1641 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1642 |
//========================= 3-bit (de)-quantization
|
| 1643 |
|
| 1644 |
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
|
|
@@ -2584,14 +2821,6 @@ static const uint8_t ksigns_iq2xs[128] = {
|
|
| 2584 |
|
| 2585 |
static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
|
| 2586 |
|
| 2587 |
-
void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) {
|
| 2588 |
-
(void)x;
|
| 2589 |
-
(void)y;
|
| 2590 |
-
(void)k;
|
| 2591 |
-
assert(k % QK_K == 0);
|
| 2592 |
-
//fprintf(stderr, "=========================== %s: not implemented\n", __func__);
|
| 2593 |
-
}
|
| 2594 |
-
|
| 2595 |
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
|
| 2596 |
assert(k % QK_K == 0);
|
| 2597 |
const int nb = k / QK_K;
|
|
@@ -2618,33 +2847,8 @@ void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y
|
|
| 2618 |
}
|
| 2619 |
}
|
| 2620 |
|
| 2621 |
-
void quantize_row_iq2_xxs(const float * restrict x, void * restrict vy, int k) {
|
| 2622 |
-
assert(k % QK_K == 0);
|
| 2623 |
-
block_iq2_xxs * restrict y = vy;
|
| 2624 |
-
quantize_row_iq2_xxs_reference(x, y, k);
|
| 2625 |
-
}
|
| 2626 |
-
|
| 2627 |
-
size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist) {
|
| 2628 |
-
assert(k % QK_K == 0);
|
| 2629 |
-
(void)hist; // TODO: collect histograms
|
| 2630 |
-
|
| 2631 |
-
for (int j = 0; j < n; j += k) {
|
| 2632 |
-
block_iq2_xxs * restrict y = (block_iq2_xxs *)dst + j/QK_K;
|
| 2633 |
-
quantize_row_iq2_xxs_reference(src + j, y, k);
|
| 2634 |
-
}
|
| 2635 |
-
return (n/QK_K*sizeof(block_iq2_xxs));
|
| 2636 |
-
}
|
| 2637 |
-
|
| 2638 |
// ====================== 2.3125 bpw (de)-quantization
|
| 2639 |
|
| 2640 |
-
void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) {
|
| 2641 |
-
(void)x;
|
| 2642 |
-
(void)y;
|
| 2643 |
-
(void)k;
|
| 2644 |
-
assert(k % QK_K == 0);
|
| 2645 |
-
//fprintf(stderr, "=========================== %s: not implemented\n", __func__);
|
| 2646 |
-
}
|
| 2647 |
-
|
| 2648 |
void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
|
| 2649 |
assert(k % QK_K == 0);
|
| 2650 |
const int nb = k / QK_K;
|
|
@@ -2670,23 +2874,6 @@ void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y,
|
|
| 2670 |
}
|
| 2671 |
}
|
| 2672 |
|
| 2673 |
-
void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) {
|
| 2674 |
-
assert(k % QK_K == 0);
|
| 2675 |
-
block_iq2_xs * restrict y = vy;
|
| 2676 |
-
quantize_row_iq2_xs_reference(x, y, k);
|
| 2677 |
-
}
|
| 2678 |
-
|
| 2679 |
-
size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) {
|
| 2680 |
-
assert(k % QK_K == 0);
|
| 2681 |
-
(void)hist; // TODO: collect histograms
|
| 2682 |
-
|
| 2683 |
-
for (int j = 0; j < n; j += k) {
|
| 2684 |
-
block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K;
|
| 2685 |
-
quantize_row_iq2_xs_reference(src + j, y, k);
|
| 2686 |
-
}
|
| 2687 |
-
return (n/QK_K*sizeof(block_iq2_xs));
|
| 2688 |
-
}
|
| 2689 |
-
|
| 2690 |
//===================================== Q8_K ==============================================
|
| 2691 |
|
| 2692 |
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
|
@@ -7730,3 +7917,666 @@ void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * rest
|
|
| 7730 |
*s = 0.125f * sumf;
|
| 7731 |
#endif
|
| 7732 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
#include <string.h>
|
| 6 |
#include <assert.h>
|
| 7 |
#include <float.h>
|
| 8 |
+
#include <stdlib.h> // for qsort
|
| 9 |
+
#include <stdio.h> // for GGML_ASSERT
|
| 10 |
|
| 11 |
#ifdef __ARM_NEON
|
| 12 |
|
|
|
|
| 1641 |
return (n/QK_K*sizeof(block_q2_K));
|
| 1642 |
}
|
| 1643 |
|
| 1644 |
+
// Quantizes n values x[] to integer levels in [0, nmax] using an affine model
//     x[i] ~ scale * L[i] - (*the_min)
// and returns `scale`. The error being minimized is weighted by weights[i], or by
// x[i]^2 when `weights` is NULL; it is the absolute error when `use_mad` is true and
// the squared error otherwise.
//
//   L       - output levels (n entries)
//   the_min - output: negated offset of the fit (always >= 0, the offset is clamped to <= 0)
//   Laux    - scratch buffer for candidate levels (n entries)
//   rmin, rdelta, nstep - candidate inverse scales are (rmin + rdelta*is + nmax)/(max - min)
//             for is = 0..nstep; with nstep < 1 only the plain min/max fit is used.
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
        uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
        float rmin, float rdelta, int nstep, bool use_mad) {
    // Range of the data and the weighted sums that do not depend on the levels.
    float lo    = x[0];
    float hi    = x[0];
    float sum_w = weights ? weights[0] : x[0]*x[0];
    float sum_x = sum_w * x[0];
    for (int i = 1; i < n; ++i) {
        if (x[i] < lo) lo = x[i];
        if (x[i] > hi) hi = x[i];
        float w = weights ? weights[i] : x[i]*x[i];
        sum_w += w;
        sum_x += w * x[i];
    }

    // The offset is never allowed to be positive.
    if (lo > 0) {
        lo = 0;
    }

    // Degenerate range: everything maps to level 0.
    if (hi <= lo) {
        for (int i = 0; i < n; ++i) L[i] = 0;
        *the_min = -lo;
        return 0.f;
    }

    // Baseline: straightforward min/max quantization and its error.
    float iscale     = nmax/(hi - lo);
    float scale      = 1/iscale;
    float best_error = 0;
    for (int i = 0; i < n; ++i) {
        int l = nearest_int(iscale*(x[i] - lo));
        L[i] = MAX(0, MIN(nmax, l));
        float diff = scale * L[i] + lo - x[i];
        if (use_mad) {
            diff = fabsf(diff);
        } else {
            diff = diff*diff;
        }
        float w = weights ? weights[i] : x[i]*x[i];
        best_error += w * diff;
    }

    if (nstep >= 1) {
        // Try a range of inverse scales; for each, re-fit scale/offset by weighted least squares.
        // Note that `lo` is updated whenever a candidate wins, so later candidates are built
        // from the refined offset.
        for (int is = 0; is <= nstep; ++is) {
            iscale = (rmin + rdelta*is + nmax)/(hi - lo);
            float sum_l = 0, sum_l2 = 0, sum_xl = 0;
            for (int i = 0; i < n; ++i) {
                int l = nearest_int(iscale*(x[i] - lo));
                l = MAX(0, MIN(nmax, l));
                Laux[i] = l;
                float w = weights ? weights[i] : x[i]*x[i];
                sum_l  += w*l;
                sum_l2 += w*l*l;
                sum_xl += w*l*x[i];
            }

            // Determinant of the 2x2 normal equations; skip singular candidates.
            float D = sum_w * sum_l2 - sum_l * sum_l;
            if (!(D > 0)) {
                continue;
            }

            float cand_scale = (sum_w * sum_xl - sum_x * sum_l)/D;
            float cand_min   = (sum_l2 * sum_x - sum_l * sum_xl)/D;
            if (cand_min > 0) {
                // Offset must stay <= 0: pin it to 0 and fit the scale alone.
                cand_min   = 0;
                cand_scale = sum_xl / sum_l2;
            }

            float cand_error = 0;
            for (int i = 0; i < n; ++i) {
                float diff = cand_scale * Laux[i] + cand_min - x[i];
                if (use_mad) {
                    diff = fabsf(diff);
                } else {
                    diff = diff*diff;
                }
                float w = weights ? weights[i] : x[i]*x[i];
                cand_error += w * diff;
            }

            if (cand_error < best_error) {
                for (int i = 0; i < n; ++i) {
                    L[i] = Laux[i];
                }
                best_error = cand_error;
                scale      = cand_scale;
                lo         = cand_min;
            }
        }
    }

    *the_min = -lo;
    return scale;
}
|
| 1721 |
+
|
| 1722 |
+
// Quantizes n values x[] to levels L[i] <= nmax with a pure scale (no offset),
//     x[i] ~ scale * L[i],
// minimizing the squared error weighted by quant_weights[i]. Returns the scale.
//
// Levels are only clamped from above, so the inputs are presumably non-negative
// (NOTE(review): the visible caller passes block scales/mins - confirm).
//
// Steps:
//   1. coarse search over 9 inverse scales around nmax/max,
//   2. up to 5 sweeps of coordinate-wise level refinement,
//   3. the least-squares scale sumlx/suml2 for the final levels.
//
// If the weighted sum of squared levels is not positive (e.g. all weights are zero),
// the least-squares scale is undefined (0/0); in that case the scale selected by the
// coarse search is returned instead of NaN.
static float make_qp_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, const float * quant_weights) {
    float max = 0;
    for (int i = 0; i < n; ++i) {
        max = MAX(max, x[i]);
    }
    if (!max) { // all zero
        for (int i = 0; i < n; ++i) { L[i] = 0; }
        return 0.f;
    }

    // Baseline: the largest value maps exactly onto nmax.
    float iscale = nmax / max;
    for (int i = 0; i < n; ++i) {
        L[i] = nearest_int(iscale * x[i]);
    }
    float scale = 1/iscale;
    float best_mse = 0;
    for (int i = 0; i < n; ++i) {
        float diff = x[i] - scale*L[i];
        float w = quant_weights[i];
        best_mse += w*diff*diff;
    }

    // Coarse search: perturb the inverse scale by -0.4 .. +0.4 levels.
    for (int is = -4; is <= 4; ++is) {
        if (is == 0) continue;
        float iscale_is = (0.1f*is + nmax)/max;
        float scale_is = 1/iscale_is;
        float mse = 0;
        for (int i = 0; i < n; ++i) {
            int l = nearest_int(iscale_is*x[i]);
            l = MIN(nmax, l);
            float diff = x[i] - scale_is*l;
            float w = quant_weights[i];
            mse += w*diff*diff;
        }
        if (mse < best_mse) {
            best_mse = mse;
            iscale = iscale_is;
        }
    }

    // Levels for the winning inverse scale plus the sums needed for the least-squares scale.
    float sumlx = 0;
    float suml2 = 0;
    for (int i = 0; i < n; ++i) {
        int l = nearest_int(iscale * x[i]);
        l = MIN(nmax, l);
        L[i] = l;
        float w = quant_weights[i];
        sumlx += w*x[i]*l;
        suml2 += w*l*l;
    }

    // Refinement: re-pick each level given all the others, accepting a change only when it
    // increases sumlx^2/suml2 (compared in cross-multiplied form to avoid the divisions).
    for (int itry = 0; itry < 5; ++itry) {
        int n_changed = 0;
        for (int i = 0; i < n; ++i) {
            float w = quant_weights[i];
            float slx = sumlx - w*x[i]*L[i];
            float sl2 = suml2 - w*L[i]*L[i];
            if (slx > 0 && sl2 > 0) {
                int new_l = nearest_int(x[i] * sl2 / slx);
                new_l = MIN(nmax, new_l);
                if (new_l != L[i]) {
                    slx += w*x[i]*new_l;
                    sl2 += w*new_l*new_l;
                    if (slx*slx*suml2 > sumlx*sumlx*sl2) {
                        L[i] = new_l; sumlx = slx; suml2 = sl2;
                        ++n_changed;
                    }
                }
            }
        }
        if (!n_changed) {
            break;
        }
    }

    // Guard the division: with zero total weight the least-squares scale is 0/0.
    if (!(suml2 > 0)) {
        return 1/iscale;
    }
    return sumlx / suml2;
}
|
| 1794 |
+
|
| 1795 |
+
// Quantizes k floats (k must be a multiple of QK_K) from x into k/QK_K block_q2_K blocks at y,
// using per-element importance weights.
//
//   quant_weights - must be non-NULL; entry QK_K*i + 16*j + l is the importance of element l
//                   of 16-wide sub-block j of block i.
//
// Per block:
//   1. each 16-element sub-block gets a 2-bit (levels 0..3) scale/min fit from make_qkx3_quants,
//      weighted by quant_weights * sqrt(sigma2 + x^2) where sigma2 is the block's mean square,
//   2. the sub-block scales and mins are quantized to 4 bits each with make_qp_quants, weighted
//      by the summed sub-block weights, and stored with fp16 super-scales d / dmin,
//   3. the 2-bit levels are recomputed against the scales/mins that were actually stored,
//   4. the levels are packed four to a byte into qs.
static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restrict y, int k, const float * restrict quant_weights) {
    GGML_ASSERT(quant_weights);
    assert(k % QK_K == 0);
    const int nb = k / QK_K;
    const bool requantize = true;

    uint8_t L[QK_K];
    uint8_t Laux[16];
    float   mins[QK_K/16];
    float   scales[QK_K/16];
    float   sw[QK_K/16];
    // One weight per element of a 16-wide sub-block. This must be 16 entries regardless of QK_K:
    // sizing it QK_K/16 overflows the array when QK_K == 64.
    float   weight[16];
    uint8_t Ls[QK_K/16], Lm[QK_K/16];

    for (int i = 0; i < nb; i++) {
        memset(sw, 0, QK_K/16*sizeof(float));
        float sumx2 = 0;
        for (int j = 0; j < QK_K; ++j) sumx2 += x[j]*x[j];
        float sigma2 = sumx2/QK_K;
        for (int j = 0; j < QK_K/16; ++j) {
            const float * restrict qw = quant_weights + QK_K * i + 16*j;
            for (int l = 0; l < 16; ++l) weight[l] = qw[l] * sqrtf(sigma2 + x[16*j + l]*x[16*j + l]);
            for (int l = 0; l < 16; ++l) sw[j] += weight[l];
            scales[j] = make_qkx3_quants(16, 3, x + 16*j, weight, L + 16*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
        }

        float dm = make_qp_quants(QK_K/16, 15, scales, Ls, sw);
        float mm = make_qp_quants(QK_K/16, 15, mins,   Lm, sw);
        y[i].d    = GGML_FP32_TO_FP16(dm);
        y[i].dmin = GGML_FP32_TO_FP16(mm);
        // Continue with the values as they will be read back from fp16.
        dm = GGML_FP16_TO_FP32(y[i].d);
        mm = GGML_FP16_TO_FP32(y[i].dmin);

        // Low nibble: quantized scale, high nibble: quantized min.
        for (int j = 0; j < QK_K/16; ++j) {
            y[i].scales[j] = Ls[j] | (Lm[j] << 4);
        }

        if (requantize) {
            for (int j = 0; j < QK_K/16; ++j) {
                const float d = dm * (y[i].scales[j] & 0xF);
                if (!d) continue; // zero scale: keep the levels found above
                const float m = mm * (y[i].scales[j] >> 4);
                for (int ii = 0; ii < 16; ++ii) {
                    int l = nearest_int((x[16*j + ii] + m)/d);
                    l = MAX(0, MIN(3, l));
                    L[16*j + ii] = l;
                }
            }
        }

#if QK_K == 256
        for (int j = 0; j < QK_K; j += 128) {
            for (int l = 0; l < 32; ++l) {
                y[i].qs[j/4 + l] = L[j + l] | (L[j + l + 32] << 2) | (L[j + l + 64] << 4) | (L[j + l + 96] << 6);
            }
        }
#else
        for (int l = 0; l < 16; ++l) {
            y[i].qs[l] = L[l] | (L[l + 16] << 2) | (L[l + 32] << 4) | (L[l + 48] << 6);
        }
#endif

        x += QK_K;

    }
}
|
| 1861 |
+
|
| 1862 |
+
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 1863 |
+
(void)hist;
|
| 1864 |
+
int row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
| 1865 |
+
if (!quant_weights) {
|
| 1866 |
+
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
| 1867 |
+
}
|
| 1868 |
+
else {
|
| 1869 |
+
char * qrow = (char *)dst;
|
| 1870 |
+
for (int row = 0; row < nrow; ++row) {
|
| 1871 |
+
quantize_row_q2_K_impl(src, (block_q2_K*)qrow, n_per_row, quant_weights);
|
| 1872 |
+
src += n_per_row;
|
| 1873 |
+
qrow += row_size;
|
| 1874 |
+
}
|
| 1875 |
+
}
|
| 1876 |
+
return nrow * row_size;
|
| 1877 |
+
}
|
| 1878 |
+
|
| 1879 |
//========================= 3-bit (de)-quantization
|
| 1880 |
|
| 1881 |
void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) {
|
|
|
|
| 2821 |
|
| 2822 |
static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128};
|
| 2823 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2824 |
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
|
| 2825 |
assert(k % QK_K == 0);
|
| 2826 |
const int nb = k / QK_K;
|
|
|
|
| 2847 |
}
|
| 2848 |
}
|
| 2849 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2850 |
// ====================== 2.3125 bpw (de)-quantization
|
| 2851 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2852 |
void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) {
|
| 2853 |
assert(k % QK_K == 0);
|
| 2854 |
const int nb = k / QK_K;
|
|
|
|
| 2874 |
}
|
| 2875 |
}
|
| 2876 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2877 |
//===================================== Q8_K ==============================================
|
| 2878 |
|
| 2879 |
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) {
|
|
|
|
| 7917 |
*s = 0.125f * sumf;
|
| 7918 |
#endif
|
| 7919 |
}
|
| 7920 |
+
|
| 7921 |
+
// ================================ IQ2 quantization =============================================
|
| 7922 |
+
|
| 7923 |
+
// Lookup tables for one IQ2 grid size. All three are heap-allocated by q2xs_init_impl and
// released by q2xs_deinit_impl; they are NULL while uninitialized.
typedef struct {
    uint64_t * grid;       // grid points, 8 one-byte coordinates packed per entry
    int      * map;        // packed 2-bit code -> grid index (>= 0), or -(offset + 1) into neighbours
    uint16_t * neighbours; // per off-grid code: a count followed by that many grid indices
} iq2_entry_t;

// Slot 0 holds the 256-point grid, slot 1 the 512-point grid (see iq2_data_index).
static iq2_entry_t iq2_data[2] = {
    { .grid = NULL, .map = NULL, .neighbours = NULL },
    { .grid = NULL, .map = NULL, .neighbours = NULL },
};
|
| 7933 |
+
|
| 7934 |
+
// Maps a supported grid size to its slot in iq2_data: 256 -> 0, 512 -> 1.
// Any other size trips the assertion.
static inline int iq2_data_index(int grid_size) {
    GGML_ASSERT(grid_size == 256 || grid_size == 512);
    if (grid_size == 256) {
        return 0;
    }
    return 1;
}
|
| 7938 |
+
|
| 7939 |
+
// qsort comparator for pairs of ints: orders by the first element, then by the second.
// Returns -1, 0 or 1.
static int iq2_compare_func(const void * left, const void * right) {
    const int * lhs = (const int *)left;
    const int * rhs = (const int *)right;
    for (int field = 0; field < 2; ++field) {
        if (lhs[field] < rhs[field]) {
            return -1;
        }
        if (lhs[field] > rhs[field]) {
            return 1;
        }
    }
    return 0;
}
|
| 7944 |
+
|
| 7945 |
+
static void q2xs_init_impl(int grid_size) {
|
| 7946 |
+
const int gindex = iq2_data_index(grid_size);
|
| 7947 |
+
if (iq2_data[gindex].grid) {
|
| 7948 |
+
return;
|
| 7949 |
+
}
|
| 7950 |
+
static const uint16_t kgrid_256[256] = {
|
| 7951 |
+
0, 2, 5, 8, 10, 17, 20, 32, 34, 40, 42, 65, 68, 80, 88, 97,
|
| 7952 |
+
100, 128, 130, 138, 162, 257, 260, 272, 277, 320, 388, 408, 512, 514, 546, 642,
|
| 7953 |
+
1025, 1028, 1040, 1057, 1060, 1088, 1090, 1096, 1120, 1153, 1156, 1168, 1188, 1280, 1282, 1288,
|
| 7954 |
+
1312, 1350, 1385, 1408, 1425, 1545, 1552, 1600, 1668, 1700, 2048, 2053, 2056, 2068, 2088, 2113,
|
| 7955 |
+
2116, 2128, 2130, 2184, 2308, 2368, 2562, 2580, 4097, 4100, 4112, 4129, 4160, 4192, 4228, 4240,
|
| 7956 |
+
4245, 4352, 4360, 4384, 4432, 4442, 4480, 4644, 4677, 5120, 5128, 5152, 5157, 5193, 5248, 5400,
|
| 7957 |
+
5474, 5632, 5654, 6145, 6148, 6160, 6208, 6273, 6400, 6405, 6560, 6737, 8192, 8194, 8202, 8260,
|
| 7958 |
+
8289, 8320, 8322, 8489, 8520, 8704, 8706, 9217, 9220, 9232, 9280, 9302, 9472, 9537, 9572, 9872,
|
| 7959 |
+
10248, 10272, 10388, 10820, 16385, 16388, 16400, 16408, 16417, 16420, 16448, 16456, 16470, 16480, 16513, 16516,
|
| 7960 |
+
16528, 16640, 16672, 16737, 16768, 16773, 16897, 16912, 16968, 16982, 17000, 17408, 17416, 17440, 17536, 17561,
|
| 7961 |
+
17682, 17700, 17920, 18433, 18436, 18448, 18496, 18501, 18688, 18776, 18785, 18818, 19013, 19088, 20480, 20488,
|
| 7962 |
+
20497, 20505, 20512, 20608, 20616, 20740, 20802, 20900, 21137, 21648, 21650, 21770, 22017, 22100, 22528, 22545,
|
| 7963 |
+
22553, 22628, 22848, 23048, 24580, 24592, 24640, 24680, 24832, 24917, 25112, 25184, 25600, 25605, 25872, 25874,
|
| 7964 |
+
25988, 26690, 32768, 32770, 32778, 32833, 32898, 33028, 33048, 33088, 33297, 33793, 33796, 33808, 33813, 33856,
|
| 7965 |
+
33888, 34048, 34118, 34196, 34313, 34368, 34400, 34818, 35076, 35345, 36868, 36880, 36900, 36928, 37025, 37142,
|
| 7966 |
+
37248, 37445, 37888, 37922, 37956, 38225, 39041, 39200, 40962, 41040, 41093, 41225, 41472, 42008, 43088, 43268,
|
| 7967 |
+
};
|
| 7968 |
+
static const uint16_t kgrid_512[512] = {
|
| 7969 |
+
0, 2, 5, 8, 10, 17, 20, 22, 25, 32, 34, 37, 40, 65, 68, 70,
|
| 7970 |
+
73, 80, 82, 85, 88, 97, 100, 128, 130, 133, 136, 145, 148, 153, 160, 257,
|
| 7971 |
+
260, 262, 265, 272, 274, 277, 280, 282, 289, 292, 320, 322, 325, 328, 337, 340,
|
| 7972 |
+
352, 360, 385, 388, 400, 512, 514, 517, 520, 529, 532, 544, 577, 580, 592, 597,
|
| 7973 |
+
640, 650, 1025, 1028, 1030, 1033, 1040, 1042, 1045, 1048, 1057, 1060, 1088, 1090, 1093, 1096,
|
| 7974 |
+
1105, 1108, 1110, 1120, 1153, 1156, 1168, 1280, 1282, 1285, 1288, 1297, 1300, 1312, 1345, 1348,
|
| 7975 |
+
1360, 1377, 1408, 1537, 1540, 1552, 1574, 1600, 1602, 1668, 2048, 2050, 2053, 2056, 2058, 2065,
|
| 7976 |
+
2068, 2080, 2085, 2113, 2116, 2128, 2136, 2176, 2208, 2218, 2305, 2308, 2320, 2368, 2433, 2441,
|
| 7977 |
+
2560, 2592, 2600, 2710, 2720, 4097, 4100, 4102, 4105, 4112, 4114, 4117, 4120, 4129, 4132, 4160,
|
| 7978 |
+
4162, 4165, 4168, 4177, 4180, 4192, 4202, 4225, 4228, 4240, 4352, 4354, 4357, 4360, 4369, 4372,
|
| 7979 |
+
4384, 4417, 4420, 4432, 4480, 4500, 4502, 4609, 4612, 4614, 4624, 4672, 4704, 5120, 5122, 5125,
|
| 7980 |
+
5128, 5137, 5140, 5152, 5185, 5188, 5193, 5200, 5220, 5248, 5377, 5380, 5392, 5440, 5632, 5652,
|
| 7981 |
+
5705, 6145, 6148, 6160, 6162, 6208, 6228, 6278, 6400, 6405, 6502, 6737, 6825, 8192, 8194, 8197,
|
| 7982 |
+
8200, 8202, 8209, 8212, 8224, 8257, 8260, 8272, 8320, 8352, 8449, 8452, 8464, 8512, 8520, 8549,
|
| 7983 |
+
8704, 8738, 8832, 8872, 9217, 9220, 9232, 9257, 9280, 9472, 9537, 9554, 9625, 9729, 9754, 9894,
|
| 7984 |
+
10240, 10248, 10250, 10272, 10325, 10376, 10402, 10600, 10640, 10760, 10784, 10882, 10888, 10890, 16385, 16388,
|
| 7985 |
+
16390, 16393, 16400, 16402, 16405, 16408, 16417, 16420, 16448, 16450, 16453, 16456, 16458, 16465, 16468, 16480,
|
| 7986 |
+
16485, 16513, 16516, 16528, 16640, 16642, 16645, 16648, 16657, 16660, 16672, 16705, 16708, 16720, 16768, 16773,
|
| 7987 |
+
16802, 16897, 16900, 16912, 16914, 16937, 16960, 17408, 17410, 17413, 17416, 17425, 17428, 17433, 17440, 17473,
|
| 7988 |
+
17476, 17488, 17536, 17556, 17665, 17668, 17680, 17700, 17728, 17818, 17920, 17930, 17988, 18000, 18433, 18436,
|
| 7989 |
+
18448, 18496, 18501, 18516, 18530, 18688, 18705, 18756, 18768, 18793, 18948, 20480, 20482, 20485, 20488, 20497,
|
| 7990 |
+
20500, 20512, 20520, 20545, 20548, 20560, 20608, 20737, 20740, 20752, 20757, 20800, 20802, 20992, 21060, 21162,
|
| 7991 |
+
21505, 21508, 21520, 21537, 21568, 21600, 21633, 21665, 21760, 21768, 21888, 21896, 22049, 22120, 22177, 22528,
|
| 7992 |
+
22548, 22593, 22608, 22681, 22810, 22848, 22850, 23173, 24577, 24580, 24592, 24640, 24660, 24674, 24710, 24745,
|
| 7993 |
+
24832, 25124, 25162, 25234, 25600, 25622, 25872, 25920, 25925, 26020, 26625, 26730, 26917, 27142, 27220, 27234,
|
| 7994 |
+
32768, 32770, 32773, 32776, 32785, 32788, 32800, 32810, 32833, 32836, 32848, 32896, 32898, 32936, 32938, 33025,
|
| 7995 |
+
33028, 33030, 33040, 33088, 33105, 33113, 33280, 33312, 33408, 33410, 33440, 33448, 33793, 33796, 33808, 33810,
|
| 7996 |
+
33813, 33856, 33888, 33929, 34048, 34116, 34213, 34328, 34410, 34816, 34824, 34853, 34906, 34944, 34946, 34984,
|
| 7997 |
+
35078, 35362, 35456, 35464, 35478, 35496, 36865, 36868, 36880, 36928, 36950, 36996, 37120, 37154, 37220, 37462,
|
| 7998 |
+
37513, 37888, 37893, 37956, 37968, 37976, 38185, 38288, 38290, 38465, 38993, 39078, 39241, 39445, 39520, 40960,
|
| 7999 |
+
40962, 40968, 40970, 40992, 41002, 41120, 41297, 41305, 41382, 41472, 41474, 41480, 41514, 41600, 41632, 42048,
|
| 8000 |
+
42133, 42597, 42648, 43018, 43040, 43042, 43048, 43168, 43176, 43268, 43396, 43398, 43560, 43562, 43665, 43690,
|
| 8001 |
+
};
|
| 8002 |
+
const int kmap_size = 43692;
|
| 8003 |
+
const int nwant = 2;
|
| 8004 |
+
const uint16_t * kgrid = grid_size == 256 ? kgrid_256 : kgrid_512;
|
| 8005 |
+
uint64_t * kgrid_q2xs;
|
| 8006 |
+
int * kmap_q2xs;
|
| 8007 |
+
uint16_t * kneighbors_q2xs;
|
| 8008 |
+
|
| 8009 |
+
printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
| 8010 |
+
uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
|
| 8011 |
+
for (int k = 0; k < grid_size; ++k) {
|
| 8012 |
+
int8_t * pos = (int8_t *)(the_grid + k);
|
| 8013 |
+
for (int i = 0; i < 8; ++i) {
|
| 8014 |
+
int l = (kgrid[k] >> 2*i) & 0x3;
|
| 8015 |
+
pos[i] = 2*l + 1;
|
| 8016 |
+
}
|
| 8017 |
+
}
|
| 8018 |
+
kgrid_q2xs = the_grid;
|
| 8019 |
+
iq2_data[gindex].grid = the_grid;
|
| 8020 |
+
kmap_q2xs = (int *)malloc(kmap_size*sizeof(int));
|
| 8021 |
+
iq2_data[gindex].map = kmap_q2xs;
|
| 8022 |
+
for (int i = 0; i < kmap_size; ++i) kmap_q2xs[i] = -1;
|
| 8023 |
+
uint64_t aux64;
|
| 8024 |
+
uint8_t * aux8 = (uint8_t *)&aux64;
|
| 8025 |
+
for (int i = 0; i < grid_size; ++i) {
|
| 8026 |
+
aux64 = kgrid_q2xs[i];
|
| 8027 |
+
uint16_t index = 0;
|
| 8028 |
+
for (int k=0; k<8; ++k) {
|
| 8029 |
+
uint16_t q = (aux8[k] - 1)/2;
|
| 8030 |
+
index |= (q << 2*k);
|
| 8031 |
+
}
|
| 8032 |
+
kmap_q2xs[index] = i;
|
| 8033 |
+
}
|
| 8034 |
+
int8_t pos[8];
|
| 8035 |
+
int * dist2 = (int *)malloc(2*grid_size*sizeof(int));
|
| 8036 |
+
int num_neighbors = 0, num_not_in_map = 0;
|
| 8037 |
+
for (int i = 0; i < kmap_size; ++i) {
|
| 8038 |
+
if (kmap_q2xs[i] >= 0) continue;
|
| 8039 |
+
++num_not_in_map;
|
| 8040 |
+
for (int k = 0; k < 8; ++k) {
|
| 8041 |
+
int l = (i >> 2*k) & 0x3;
|
| 8042 |
+
pos[k] = 2*l + 1;
|
| 8043 |
+
}
|
| 8044 |
+
for (int j = 0; j < grid_size; ++j) {
|
| 8045 |
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
|
| 8046 |
+
int d2 = 0;
|
| 8047 |
+
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
| 8048 |
+
dist2[2*j+0] = d2;
|
| 8049 |
+
dist2[2*j+1] = j;
|
| 8050 |
+
}
|
| 8051 |
+
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
|
| 8052 |
+
int n = 0; int d2 = dist2[0];
|
| 8053 |
+
int nhave = 1;
|
| 8054 |
+
for (int j = 0; j < grid_size; ++j) {
|
| 8055 |
+
if (dist2[2*j] > d2) {
|
| 8056 |
+
if (nhave == nwant) break;
|
| 8057 |
+
d2 = dist2[2*j];
|
| 8058 |
+
++nhave;
|
| 8059 |
+
}
|
| 8060 |
+
++n;
|
| 8061 |
+
}
|
| 8062 |
+
num_neighbors += n;
|
| 8063 |
+
}
|
| 8064 |
+
printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
| 8065 |
+
kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
| 8066 |
+
iq2_data[gindex].neighbours = kneighbors_q2xs;
|
| 8067 |
+
int counter = 0;
|
| 8068 |
+
for (int i = 0; i < kmap_size; ++i) {
|
| 8069 |
+
if (kmap_q2xs[i] >= 0) continue;
|
| 8070 |
+
for (int k = 0; k < 8; ++k) {
|
| 8071 |
+
int l = (i >> 2*k) & 0x3;
|
| 8072 |
+
pos[k] = 2*l + 1;
|
| 8073 |
+
}
|
| 8074 |
+
for (int j = 0; j < grid_size; ++j) {
|
| 8075 |
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + j);
|
| 8076 |
+
int d2 = 0;
|
| 8077 |
+
for (int k = 0; k < 8; ++k) d2 += (pg[k] - pos[k])*(pg[k] - pos[k]);
|
| 8078 |
+
dist2[2*j+0] = d2;
|
| 8079 |
+
dist2[2*j+1] = j;
|
| 8080 |
+
}
|
| 8081 |
+
qsort(dist2, grid_size, 2*sizeof(int), iq2_compare_func);
|
| 8082 |
+
kmap_q2xs[i] = -(counter + 1);
|
| 8083 |
+
int d2 = dist2[0];
|
| 8084 |
+
uint16_t * start = &kneighbors_q2xs[counter++];
|
| 8085 |
+
int n = 0, nhave = 1;
|
| 8086 |
+
for (int j = 0; j < grid_size; ++j) {
|
| 8087 |
+
if (dist2[2*j] > d2) {
|
| 8088 |
+
if (nhave == nwant) break;
|
| 8089 |
+
d2 = dist2[2*j];
|
| 8090 |
+
++nhave;
|
| 8091 |
+
}
|
| 8092 |
+
kneighbors_q2xs[counter++] = dist2[2*j+1];
|
| 8093 |
+
++n;
|
| 8094 |
+
}
|
| 8095 |
+
*start = n;
|
| 8096 |
+
}
|
| 8097 |
+
free(dist2);
|
| 8098 |
+
}
|
| 8099 |
+
|
| 8100 |
+
void ggml_init_iq2_quantization(enum ggml_type type) {
|
| 8101 |
+
if (type == GGML_TYPE_IQ2_XXS) {
|
| 8102 |
+
q2xs_init_impl(256);
|
| 8103 |
+
}
|
| 8104 |
+
else if (type == GGML_TYPE_IQ2_XS) {
|
| 8105 |
+
q2xs_init_impl(512);
|
| 8106 |
+
}
|
| 8107 |
+
else {
|
| 8108 |
+
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
|
| 8109 |
+
}
|
| 8110 |
+
}
|
| 8111 |
+
|
| 8112 |
+
static void q2xs_deinit_impl(int grid_size) {
|
| 8113 |
+
GGML_ASSERT(grid_size == 256 || grid_size == 512 || grid_size == 1024);
|
| 8114 |
+
const int gindex = iq2_data_index(grid_size);
|
| 8115 |
+
if (iq2_data[gindex].grid) {
|
| 8116 |
+
free(iq2_data[gindex].grid); iq2_data[gindex].grid = NULL;
|
| 8117 |
+
free(iq2_data[gindex].map); iq2_data[gindex].map = NULL;
|
| 8118 |
+
free(iq2_data[gindex].neighbours); iq2_data[gindex].neighbours = NULL;
|
| 8119 |
+
}
|
| 8120 |
+
}
|
| 8121 |
+
|
| 8122 |
+
void ggml_deinit_iq2_quantization(enum ggml_type type) {
|
| 8123 |
+
if (type == GGML_TYPE_IQ2_XXS) {
|
| 8124 |
+
q2xs_deinit_impl(256);
|
| 8125 |
+
}
|
| 8126 |
+
else if (type == GGML_TYPE_IQ2_XS) {
|
| 8127 |
+
q2xs_deinit_impl(512);
|
| 8128 |
+
}
|
| 8129 |
+
else {
|
| 8130 |
+
fprintf(stderr, "======================== Why are you calling %s with type %d?\n", __func__, (int)type);
|
| 8131 |
+
}
|
| 8132 |
+
}
|
| 8133 |
+
|
| 8134 |
+
// Picks, among a list of candidate grid points, the one with the smallest
// weighted squared error against 8 input values.
//
//   neighbours : neighbours[0] is the candidate count, neighbours[1..count]
//                are indices into grid
//   grid       : grid points, each uint64_t is read as 8 int8_t coordinates
//   xval       : the 8 values to approximate
//   weight     : the 8 per-value error weights
//   scale      : factor applied to a grid coordinate before comparing to xval
//   L          : output, receives (coordinate - 1)/2 for the winning point
//
// Returns the grid index of the winning point. Asserts that the list is not
// empty and that a winner was found.
static int iq2_find_best_neighbour(const uint16_t * restrict neighbours, const uint64_t * restrict grid,
        const float * restrict xval, const float * restrict weight, float scale, int8_t * restrict L) {
    int num_neighbors = neighbours[0];
    GGML_ASSERT(num_neighbors > 0);

    int   grid_index = -1;
    float lowest_err = FLT_MAX;

    for (int n = 1; n <= num_neighbors; ++n) {
        const int      candidate = neighbours[n];
        const int8_t * point     = (const int8_t *)(grid + candidate);

        float err = 0;
        for (int i = 0; i < 8; ++i) {
            float q = point[i];
            float diff = scale*q - xval[i];
            err += weight[i]*diff*diff;
        }

        // Strict comparison: on ties the earliest candidate in the list wins.
        if (err < lowest_err) {
            lowest_err = err;
            grid_index = candidate;
        }
    }
    GGML_ASSERT(grid_index >= 0);

    const int8_t * winner = (const int8_t *)(grid + grid_index);
    for (int i = 0; i < 8; ++i) {
        L[i] = (winner[i] - 1)/2;
    }
    return grid_index;
}
|
| 8157 |
+
|
| 8158 |
+
static void quantize_row_iq2_xxs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
| 8159 |
+
|
| 8160 |
+
const int gindex = iq2_data_index(256);
|
| 8161 |
+
|
| 8162 |
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
| 8163 |
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
| 8164 |
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
| 8165 |
+
|
| 8166 |
+
GGML_ASSERT(quant_weights);
|
| 8167 |
+
GGML_ASSERT(kgrid_q2xs);
|
| 8168 |
+
GGML_ASSERT(kmap_q2xs);
|
| 8169 |
+
GGML_ASSERT(kneighbors_q2xs);
|
| 8170 |
+
GGML_ASSERT(n%QK_K == 0);
|
| 8171 |
+
|
| 8172 |
+
const int kMaxQ = 3;
|
| 8173 |
+
|
| 8174 |
+
const int nbl = n/256;
|
| 8175 |
+
|
| 8176 |
+
block_iq2_xxs * y = vy;
|
| 8177 |
+
|
| 8178 |
+
float scales[QK_K/32];
|
| 8179 |
+
float weight[32];
|
| 8180 |
+
float xval[32];
|
| 8181 |
+
int8_t L[32];
|
| 8182 |
+
int8_t Laux[32];
|
| 8183 |
+
float waux[32];
|
| 8184 |
+
bool is_on_grid[4];
|
| 8185 |
+
bool is_on_grid_aux[4];
|
| 8186 |
+
uint8_t block_signs[4];
|
| 8187 |
+
uint32_t q2[2*(QK_K/32)];
|
| 8188 |
+
|
| 8189 |
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
| 8190 |
+
|
| 8191 |
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
| 8192 |
+
memset(q2, 0, QK_K/4);
|
| 8193 |
+
|
| 8194 |
+
float max_scale = 0;
|
| 8195 |
+
|
| 8196 |
+
const float * xbl = x + QK_K*ibl;
|
| 8197 |
+
float sumx2 = 0;
|
| 8198 |
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
| 8199 |
+
float sigma2 = sumx2/QK_K;
|
| 8200 |
+
|
| 8201 |
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
| 8202 |
+
const float * xb = xbl + 32*ib;
|
| 8203 |
+
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
| 8204 |
+
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
| 8205 |
+
for (int i = 0; i < 32; ++i) waux[i] = sqrtf(weight[i]);
|
| 8206 |
+
for (int k = 0; k < 4; ++k) {
|
| 8207 |
+
int nflip = 0;
|
| 8208 |
+
uint8_t s = 0;
|
| 8209 |
+
for (int i = 0; i < 8; ++i) {
|
| 8210 |
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
| 8211 |
+
else {
|
| 8212 |
+
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
|
| 8213 |
+
}
|
| 8214 |
+
}
|
| 8215 |
+
if (nflip%2) {
|
| 8216 |
+
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
| 8217 |
+
for (int i = 1; i < 8; ++i) {
|
| 8218 |
+
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
| 8219 |
+
if (ax < min) {
|
| 8220 |
+
min = ax; imin = i;
|
| 8221 |
+
}
|
| 8222 |
+
}
|
| 8223 |
+
xval[8*k+imin] = -xval[8*k+imin];
|
| 8224 |
+
s ^= (1 << imin);
|
| 8225 |
+
}
|
| 8226 |
+
block_signs[k] = s & 127;
|
| 8227 |
+
}
|
| 8228 |
+
float max = xval[0];
|
| 8229 |
+
for (int i = 1; i < 32; ++i) max = MAX(max, xval[i]);
|
| 8230 |
+
if (!max) {
|
| 8231 |
+
scales[ib] = 0;
|
| 8232 |
+
memset(L, 0, 32);
|
| 8233 |
+
continue;
|
| 8234 |
+
}
|
| 8235 |
+
float best = 0;
|
| 8236 |
+
float scale = max/(2*kMaxQ-1);
|
| 8237 |
+
for (int is = -9; is <= 9; ++is) {
|
| 8238 |
+
float id = (2*kMaxQ-1+is*0.1f)/max;
|
| 8239 |
+
float this_scale = 1/id;
|
| 8240 |
+
for (int k = 0; k < 4; ++k) {
|
| 8241 |
+
for (int i = 0; i < 8; ++i) {
|
| 8242 |
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
| 8243 |
+
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
| 8244 |
+
}
|
| 8245 |
+
uint16_t u = 0;
|
| 8246 |
+
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
| 8247 |
+
int grid_index = kmap_q2xs[u];
|
| 8248 |
+
is_on_grid_aux[k] = true;
|
| 8249 |
+
if (grid_index < 0) {
|
| 8250 |
+
is_on_grid_aux[k] = false;
|
| 8251 |
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
| 8252 |
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
| 8253 |
+
}
|
| 8254 |
+
}
|
| 8255 |
+
float sumqx = 0, sumq2 = 0;
|
| 8256 |
+
for (int i = 0; i < 32; ++i) {
|
| 8257 |
+
float w = weight[i];
|
| 8258 |
+
float q = 2*Laux[i] + 1;
|
| 8259 |
+
sumqx += w*xval[i]*q;
|
| 8260 |
+
sumq2 += w*q*q;
|
| 8261 |
+
}
|
| 8262 |
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
| 8263 |
+
scale = sumqx/sumq2; best = scale*sumqx;
|
| 8264 |
+
for (int i = 0; i < 32; ++i) L[i] = Laux[i];
|
| 8265 |
+
for (int k = 0; k < 4; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
| 8266 |
+
}
|
| 8267 |
+
}
|
| 8268 |
+
int n_not_ongrid = 0;
|
| 8269 |
+
for (int k = 0; k < 4; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
| 8270 |
+
if (n_not_ongrid > 0 && scale > 0) {
|
| 8271 |
+
float id = 1/scale;
|
| 8272 |
+
for (int k = 0; k < 4; ++k) {
|
| 8273 |
+
if (is_on_grid[k]) continue;
|
| 8274 |
+
uint16_t u = 0;
|
| 8275 |
+
for (int i = 0; i < 8; ++i) {
|
| 8276 |
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
| 8277 |
+
l = MAX(0, MIN(kMaxQ-1, l));
|
| 8278 |
+
u |= (l << 2*i);
|
| 8279 |
+
}
|
| 8280 |
+
int grid_index = kmap_q2xs[u];
|
| 8281 |
+
if (grid_index < 0) {
|
| 8282 |
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
| 8283 |
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
| 8284 |
+
}
|
| 8285 |
+
const int8_t * pg = (const int8_t *)(kgrid_q2xs + grid_index);
|
| 8286 |
+
for (int i = 0; i < 8; ++i) L[8*k+i] = (pg[i] - 1)/2;
|
| 8287 |
+
}
|
| 8288 |
+
float sumqx = 0, sumq2 = 0;
|
| 8289 |
+
for (int i = 0; i < 32; ++i) {
|
| 8290 |
+
float w = weight[i];
|
| 8291 |
+
float q = 2*L[i] + 1;
|
| 8292 |
+
sumqx += w*xval[i]*q;
|
| 8293 |
+
sumq2 += w*q*q;
|
| 8294 |
+
}
|
| 8295 |
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
| 8296 |
+
}
|
| 8297 |
+
if (scale < 0) {
|
| 8298 |
+
// This should never happen, but just in case, flip scale so that it is positive (we use uint's to encode the scale)
|
| 8299 |
+
// and correspondingly flip quant signs.
|
| 8300 |
+
scale = -scale;
|
| 8301 |
+
for (int k = 0; k < 4; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
| 8302 |
+
}
|
| 8303 |
+
for (int k = 0; k < 4; ++k) {
|
| 8304 |
+
uint16_t u = 0;
|
| 8305 |
+
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
| 8306 |
+
int grid_index = kmap_q2xs[u];
|
| 8307 |
+
if (grid_index < 0) {
|
| 8308 |
+
printf("Oops: found point %u not on grid:", u);
|
| 8309 |
+
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
| 8310 |
+
printf("\n");
|
| 8311 |
+
GGML_ASSERT(false);
|
| 8312 |
+
}
|
| 8313 |
+
q2[2*ib+0] |= (grid_index << 8*k);
|
| 8314 |
+
q2[2*ib+1] |= (block_signs[k] << 7*k);
|
| 8315 |
+
}
|
| 8316 |
+
GGML_ASSERT(scale >= 0);
|
| 8317 |
+
scales[ib] = scale;
|
| 8318 |
+
max_scale = MAX(max_scale, scale);
|
| 8319 |
+
}
|
| 8320 |
+
|
| 8321 |
+
if (!max_scale) {
|
| 8322 |
+
memset(y[ibl].qs, 0, QK_K/4);
|
| 8323 |
+
continue;
|
| 8324 |
+
}
|
| 8325 |
+
|
| 8326 |
+
float d = max_scale/31;
|
| 8327 |
+
y[ibl].d = GGML_FP32_TO_FP16(d);
|
| 8328 |
+
float id = 1/d;
|
| 8329 |
+
float sumqx = 0, sumq2 = 0;
|
| 8330 |
+
for (int ib = 0; ib < QK_K/32; ++ib) {
|
| 8331 |
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
| 8332 |
+
l = MAX(0, MIN(15, l));
|
| 8333 |
+
q2[2*ib+1] |= ((uint32_t)l << 28);
|
| 8334 |
+
const float * xb = xbl + 32*ib;
|
| 8335 |
+
const float * qw = quant_weights + QK_K*ibl + 32*ib;
|
| 8336 |
+
for (int i = 0; i < 32; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
| 8337 |
+
const uint8_t * aux8 = (const uint8_t *)(q2 + 2*ib);
|
| 8338 |
+
const float db = d * (1 + 2*l);
|
| 8339 |
+
uint32_t u = 0;
|
| 8340 |
+
for (int k = 0; k < 4; ++k) {
|
| 8341 |
+
const int8_t * signs = keven_signs_q2xs + 8*((q2[2*ib+1] >> 7*k) & 127);
|
| 8342 |
+
const float * xk = xb + 8*k;
|
| 8343 |
+
const float * wk = weight + 8*k;
|
| 8344 |
+
const uint8_t * grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
|
| 8345 |
+
float best_mse = 0; int best_index = aux8[k];
|
| 8346 |
+
for (int j = 0; j < 8; ++j) {
|
| 8347 |
+
float diff = db * grid[j] * signs[j] - xk[j];
|
| 8348 |
+
best_mse += wk[j] * diff * diff;
|
| 8349 |
+
}
|
| 8350 |
+
for (int idx = 0; idx < 256; ++idx) {
|
| 8351 |
+
grid = (const uint8_t *)(kgrid_q2xs + idx);
|
| 8352 |
+
float mse = 0;
|
| 8353 |
+
for (int j = 0; j < 8; ++j) {
|
| 8354 |
+
float diff = db * grid[j] * signs[j] - xk[j];
|
| 8355 |
+
mse += wk[j] * diff * diff;
|
| 8356 |
+
}
|
| 8357 |
+
if (mse < best_mse) {
|
| 8358 |
+
best_mse = mse; best_index = idx;
|
| 8359 |
+
}
|
| 8360 |
+
}
|
| 8361 |
+
u |= (best_index << 8*k);
|
| 8362 |
+
grid = (const uint8_t *)(kgrid_q2xs + best_index);
|
| 8363 |
+
//grid = (const uint8_t *)(kgrid_q2xs + aux8[k]);
|
| 8364 |
+
for (int j = 0; j < 8; ++j) {
|
| 8365 |
+
float q = db * grid[j] * signs[j];
|
| 8366 |
+
sumqx += wk[j] * q * xk[j];
|
| 8367 |
+
sumq2 += wk[j] * q * q;
|
| 8368 |
+
}
|
| 8369 |
+
}
|
| 8370 |
+
q2[2*ib] = u;
|
| 8371 |
+
if (sumq2 > 0) y[ibl].d = GGML_FP32_TO_FP16(d*sumqx/sumq2);
|
| 8372 |
+
}
|
| 8373 |
+
memcpy(y[ibl].qs, q2, QK_K/4);
|
| 8374 |
+
}
|
| 8375 |
+
}
|
| 8376 |
+
|
| 8377 |
+
static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights) {
|
| 8378 |
+
|
| 8379 |
+
const int gindex = iq2_data_index(512);
|
| 8380 |
+
|
| 8381 |
+
const uint64_t * kgrid_q2xs = iq2_data[gindex].grid;
|
| 8382 |
+
const int * kmap_q2xs = iq2_data[gindex].map;
|
| 8383 |
+
const uint16_t * kneighbors_q2xs = iq2_data[gindex].neighbours;
|
| 8384 |
+
|
| 8385 |
+
GGML_ASSERT(quant_weights);
|
| 8386 |
+
GGML_ASSERT(kmap_q2xs);
|
| 8387 |
+
GGML_ASSERT(kgrid_q2xs);
|
| 8388 |
+
GGML_ASSERT(kneighbors_q2xs);
|
| 8389 |
+
GGML_ASSERT(n%QK_K == 0);
|
| 8390 |
+
|
| 8391 |
+
const int kMaxQ = 3;
|
| 8392 |
+
|
| 8393 |
+
const int nbl = n/256;
|
| 8394 |
+
|
| 8395 |
+
block_iq2_xs * y = vy;
|
| 8396 |
+
|
| 8397 |
+
float scales[QK_K/16];
|
| 8398 |
+
float weight[16];
|
| 8399 |
+
float xval[16];
|
| 8400 |
+
int8_t L[16];
|
| 8401 |
+
int8_t Laux[16];
|
| 8402 |
+
float waux[16];
|
| 8403 |
+
bool is_on_grid[2];
|
| 8404 |
+
bool is_on_grid_aux[2];
|
| 8405 |
+
uint8_t block_signs[2];
|
| 8406 |
+
uint16_t q2[2*(QK_K/16)];
|
| 8407 |
+
|
| 8408 |
+
for (int ibl = 0; ibl < nbl; ++ibl) {
|
| 8409 |
+
|
| 8410 |
+
y[ibl].d = GGML_FP32_TO_FP16(0.f);
|
| 8411 |
+
memset(q2, 0, QK_K/4);
|
| 8412 |
+
memset(y[ibl].scales, 0, QK_K/32);
|
| 8413 |
+
|
| 8414 |
+
float max_scale = 0;
|
| 8415 |
+
|
| 8416 |
+
const float * xbl = x + QK_K*ibl;
|
| 8417 |
+
float sumx2 = 0;
|
| 8418 |
+
for (int i = 0; i < QK_K; ++i) sumx2 += xbl[i]*xbl[i];
|
| 8419 |
+
float sigma2 = sumx2/QK_K;
|
| 8420 |
+
|
| 8421 |
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
| 8422 |
+
const float * xb = xbl + 16*ib;
|
| 8423 |
+
const float * qw = quant_weights + QK_K*ibl + 16*ib;
|
| 8424 |
+
for (int i = 0; i < 16; ++i) weight[i] = qw[i] * sqrtf(sigma2 + xb[i]*xb[i]);
|
| 8425 |
+
for (int i = 0; i < 16; ++i) waux[i] = sqrtf(weight[i]);
|
| 8426 |
+
for (int k = 0; k < 2; ++k) {
|
| 8427 |
+
int nflip = 0;
|
| 8428 |
+
uint8_t s = 0;
|
| 8429 |
+
for (int i = 0; i < 8; ++i) {
|
| 8430 |
+
if (xb[8*k + i] >= 0) xval[8*k + i] = xb[8*k + i];
|
| 8431 |
+
else {
|
| 8432 |
+
xval[8*k + i] = -xb[8*k + i]; ++nflip; s |= (1 << i);
|
| 8433 |
+
}
|
| 8434 |
+
}
|
| 8435 |
+
if (nflip%2) {
|
| 8436 |
+
int imin = 0; float min = weight[8*k+imin]*xb[8*k+imin]*xb[8*k+imin];
|
| 8437 |
+
for (int i = 1; i < 8; ++i) {
|
| 8438 |
+
float ax = weight[8*k+i]*xb[8*k+i]*xb[8*k+i];
|
| 8439 |
+
if (ax < min) {
|
| 8440 |
+
min = ax; imin = i;
|
| 8441 |
+
}
|
| 8442 |
+
}
|
| 8443 |
+
xval[8*k+imin] = -xval[8*k+imin];
|
| 8444 |
+
s ^= (1 << imin);
|
| 8445 |
+
}
|
| 8446 |
+
block_signs[k] = s & 127;
|
| 8447 |
+
}
|
| 8448 |
+
float max = xval[0];
|
| 8449 |
+
for (int i = 1; i < 16; ++i) max = MAX(max, xval[i]);
|
| 8450 |
+
if (!max) {
|
| 8451 |
+
scales[ib] = 0;
|
| 8452 |
+
memset(L, 0, 16);
|
| 8453 |
+
continue;
|
| 8454 |
+
}
|
| 8455 |
+
float best = 0;
|
| 8456 |
+
float scale = max/(2*kMaxQ-1);
|
| 8457 |
+
is_on_grid[0] = is_on_grid[1] = true;
|
| 8458 |
+
for (int is = -9; is <= 9; ++is) {
|
| 8459 |
+
float id = (2*kMaxQ-1+is*0.1f)/max;
|
| 8460 |
+
float this_scale = 1/id;
|
| 8461 |
+
for (int k = 0; k < 2; ++k) {
|
| 8462 |
+
for (int i = 0; i < 8; ++i) {
|
| 8463 |
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
| 8464 |
+
Laux[8*k+i] = MAX(0, MIN(kMaxQ-1, l));
|
| 8465 |
+
}
|
| 8466 |
+
uint16_t u = 0;
|
| 8467 |
+
for (int i = 0; i < 8; ++i) u |= (Laux[8*k+i] << 2*i);
|
| 8468 |
+
int grid_index = kmap_q2xs[u];
|
| 8469 |
+
is_on_grid_aux[k] = true;
|
| 8470 |
+
if (grid_index < 0) {
|
| 8471 |
+
is_on_grid_aux[k] = false;
|
| 8472 |
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
| 8473 |
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, this_scale, Laux + 8*k);
|
| 8474 |
+
}
|
| 8475 |
+
}
|
| 8476 |
+
float sumqx = 0, sumq2 = 0;
|
| 8477 |
+
for (int i = 0; i < 16; ++i) {
|
| 8478 |
+
float w = weight[i];
|
| 8479 |
+
float q = 2*Laux[i] + 1;
|
| 8480 |
+
sumqx += w*xval[i]*q;
|
| 8481 |
+
sumq2 += w*q*q;
|
| 8482 |
+
}
|
| 8483 |
+
if (sumq2 > 0 && sumqx*sumqx > best*sumq2) {
|
| 8484 |
+
scale = sumqx/sumq2; best = scale*sumqx;
|
| 8485 |
+
for (int i = 0; i < 16; ++i) L[i] = Laux[i];
|
| 8486 |
+
for (int k = 0; k < 2; ++k) is_on_grid[k] = is_on_grid_aux[k];
|
| 8487 |
+
}
|
| 8488 |
+
}
|
| 8489 |
+
int n_not_ongrid = 0;
|
| 8490 |
+
for (int k = 0; k < 2; ++k) if (!is_on_grid[k]) ++n_not_ongrid;
|
| 8491 |
+
if (n_not_ongrid > 0 && scale > 0) {
|
| 8492 |
+
float id = 1/scale;
|
| 8493 |
+
for (int k = 0; k < 2; ++k) {
|
| 8494 |
+
if (is_on_grid[k]) continue;
|
| 8495 |
+
uint16_t u = 0;
|
| 8496 |
+
for (int i = 0; i < 8; ++i) {
|
| 8497 |
+
int l = nearest_int(0.5f*(id*xval[8*k+i]-1));
|
| 8498 |
+
l = MAX(0, MIN(kMaxQ-1, l));
|
| 8499 |
+
u |= (l << 2*i);
|
| 8500 |
+
L[8*k + i] = l;
|
| 8501 |
+
}
|
| 8502 |
+
int grid_index = kmap_q2xs[u];
|
| 8503 |
+
if (grid_index < 0) {
|
| 8504 |
+
const uint16_t * neighbours = kneighbors_q2xs - kmap_q2xs[u] - 1;
|
| 8505 |
+
grid_index = iq2_find_best_neighbour(neighbours, kgrid_q2xs, xval + 8*k, waux + 8*k, scale, L + 8*k);
|
| 8506 |
+
}
|
| 8507 |
+
}
|
| 8508 |
+
float sumqx = 0, sumq2 = 0;
|
| 8509 |
+
for (int i = 0; i < 16; ++i) {
|
| 8510 |
+
float w = weight[i];
|
| 8511 |
+
float q = 2*L[i] + 1;
|
| 8512 |
+
sumqx += w*xval[i]*q;
|
| 8513 |
+
sumq2 += w*q*q;
|
| 8514 |
+
}
|
| 8515 |
+
if (sumq2 > 0) scale = sumqx/sumq2;
|
| 8516 |
+
}
|
| 8517 |
+
if (scale < 0) {
|
| 8518 |
+
scale = -scale;
|
| 8519 |
+
for (int k = 0; k < 2; ++k) block_signs[k] = (~block_signs[k]) & 127;
|
| 8520 |
+
}
|
| 8521 |
+
for (int k = 0; k < 2; ++k) {
|
| 8522 |
+
uint16_t u = 0;
|
| 8523 |
+
for (int i = 0; i < 8; ++i) u |= (L[8*k+i] << 2*i);
|
| 8524 |
+
int grid_index = kmap_q2xs[u];
|
| 8525 |
+
if (grid_index < 0) {
|
| 8526 |
+
printf("Oops: found point %u not on grid:", u);
|
| 8527 |
+
for (int i = 0; i < 8; ++i) printf(" %d", L[8*k+i]);
|
| 8528 |
+
printf("\n");
|
| 8529 |
+
GGML_ASSERT(false);
|
| 8530 |
+
}
|
| 8531 |
+
q2[2*ib+k] = grid_index | (block_signs[k] << 9);
|
| 8532 |
+
}
|
| 8533 |
+
GGML_ASSERT(scale >= 0);
|
| 8534 |
+
scales[ib] = scale;
|
| 8535 |
+
max_scale = MAX(max_scale, scale);
|
| 8536 |
+
}
|
| 8537 |
+
|
| 8538 |
+
if (!max_scale) {
|
| 8539 |
+
memset(y[ibl].qs, 0, QK_K/4);
|
| 8540 |
+
continue;
|
| 8541 |
+
}
|
| 8542 |
+
|
| 8543 |
+
float d = max_scale/31;
|
| 8544 |
+
y[ibl].d = GGML_FP32_TO_FP16(d);
|
| 8545 |
+
float id = 1/d;
|
| 8546 |
+
for (int ib = 0; ib < QK_K/16; ++ib) {
|
| 8547 |
+
int l = nearest_int(0.5f*(id*scales[ib]-1));
|
| 8548 |
+
l = MAX(0, MIN(15, l));
|
| 8549 |
+
if (ib%2 == 0) y[ibl].scales[ib/2] = l;
|
| 8550 |
+
else y[ibl].scales[ib/2] |= (l << 4);
|
| 8551 |
+
}
|
| 8552 |
+
memcpy(y[ibl].qs, q2, QK_K/4);
|
| 8553 |
+
|
| 8554 |
+
}
|
| 8555 |
+
}
|
| 8556 |
+
|
| 8557 |
+
size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 8558 |
+
(void)hist;
|
| 8559 |
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 8560 |
+
int nblock = n_per_row/QK_K;
|
| 8561 |
+
char * qrow = (char *)dst;
|
| 8562 |
+
for (int row = 0; row < nrow; ++row) {
|
| 8563 |
+
quantize_row_iq2_xxs_impl(src, qrow, n_per_row, quant_weights);
|
| 8564 |
+
src += n_per_row;
|
| 8565 |
+
qrow += nblock*sizeof(block_iq2_xxs);
|
| 8566 |
+
}
|
| 8567 |
+
return nrow * nblock * sizeof(block_iq2_xxs);
|
| 8568 |
+
}
|
| 8569 |
+
|
| 8570 |
+
size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 8571 |
+
(void)hist;
|
| 8572 |
+
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 8573 |
+
int nblock = n_per_row/QK_K;
|
| 8574 |
+
char * qrow = (char *)dst;
|
| 8575 |
+
for (int row = 0; row < nrow; ++row) {
|
| 8576 |
+
quantize_row_iq2_xs_impl(src, qrow, n_per_row, quant_weights);
|
| 8577 |
+
src += n_per_row;
|
| 8578 |
+
qrow += nblock*sizeof(block_iq2_xs);
|
| 8579 |
+
}
|
| 8580 |
+
return nrow * nblock * sizeof(block_iq2_xs);
|
| 8581 |
+
}
|
| 8582 |
+
|
ggml-quants.h
CHANGED
|
@@ -196,8 +196,6 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict
|
|
| 196 |
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
| 197 |
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
| 198 |
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
| 199 |
-
void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k);
|
| 200 |
-
void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k);
|
| 201 |
|
| 202 |
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
|
| 203 |
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
|
|
@@ -212,8 +210,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict y, int k);
|
|
| 212 |
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
| 213 |
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
| 214 |
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
| 215 |
-
void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k);
|
| 216 |
-
void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k);
|
| 217 |
|
| 218 |
// Dequantization
|
| 219 |
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
|
|
@@ -246,3 +242,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx,
|
|
| 246 |
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
| 247 |
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
| 248 |
void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k);
|
| 197 |
void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k);
|
| 198 |
void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k);
|
|
|
|
|
|
|
| 199 |
|
| 200 |
void quantize_row_q4_0(const float * restrict x, void * restrict y, int k);
|
| 201 |
void quantize_row_q4_1(const float * restrict x, void * restrict y, int k);
|
|
|
|
| 210 |
void quantize_row_q5_K(const float * restrict x, void * restrict y, int k);
|
| 211 |
void quantize_row_q6_K(const float * restrict x, void * restrict y, int k);
|
| 212 |
void quantize_row_q8_K(const float * restrict x, void * restrict y, int k);
|
|
|
|
|
|
|
| 213 |
|
| 214 |
// Dequantization
|
| 215 |
void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k);
|
|
|
|
| 242 |
void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
| 243 |
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
| 244 |
void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy);
|
| 245 |
+
|
| 246 |
+
//
|
| 247 |
+
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
| 248 |
+
//
|
| 249 |
+
size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 250 |
+
size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 251 |
+
size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 252 |
+
|
ggml.c
CHANGED
|
@@ -585,8 +585,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 585 |
.type_size = sizeof(block_iq2_xxs),
|
| 586 |
.is_quantized = true,
|
| 587 |
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
|
| 588 |
-
.from_float =
|
| 589 |
-
.from_float_reference =
|
| 590 |
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
| 591 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 592 |
},
|
|
@@ -596,8 +596,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 596 |
.type_size = sizeof(block_iq2_xs),
|
| 597 |
.is_quantized = true,
|
| 598 |
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
|
| 599 |
-
.from_float =
|
| 600 |
-
.from_float_reference =
|
| 601 |
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
| 602 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 603 |
},
|
|
@@ -18665,8 +18665,11 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t *
|
|
| 18665 |
return (n/QK8_0*sizeof(block_q8_0));
|
| 18666 |
}
|
| 18667 |
|
| 18668 |
-
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
|
|
|
|
|
|
|
| 18669 |
size_t result = 0;
|
|
|
|
| 18670 |
switch (type) {
|
| 18671 |
case GGML_TYPE_Q4_0:
|
| 18672 |
{
|
|
@@ -18701,8 +18704,11 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
| 18701 |
case GGML_TYPE_Q2_K:
|
| 18702 |
{
|
| 18703 |
GGML_ASSERT(start % QK_K == 0);
|
| 18704 |
-
|
| 18705 |
-
|
|
|
|
|
|
|
|
|
|
| 18706 |
} break;
|
| 18707 |
case GGML_TYPE_Q3_K:
|
| 18708 |
{
|
|
@@ -18731,14 +18737,22 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
| 18731 |
case GGML_TYPE_IQ2_XXS:
|
| 18732 |
{
|
| 18733 |
GGML_ASSERT(start % QK_K == 0);
|
| 18734 |
-
|
| 18735 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18736 |
} break;
|
| 18737 |
case GGML_TYPE_IQ2_XS:
|
| 18738 |
{
|
| 18739 |
GGML_ASSERT(start % QK_K == 0);
|
| 18740 |
-
|
| 18741 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18742 |
} break;
|
| 18743 |
case GGML_TYPE_F16:
|
| 18744 |
{
|
|
|
|
| 585 |
.type_size = sizeof(block_iq2_xxs),
|
| 586 |
.is_quantized = true,
|
| 587 |
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
|
| 588 |
+
.from_float = NULL,
|
| 589 |
+
.from_float_reference = NULL,
|
| 590 |
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
| 591 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 592 |
},
|
|
|
|
| 596 |
.type_size = sizeof(block_iq2_xs),
|
| 597 |
.is_quantized = true,
|
| 598 |
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
|
| 599 |
+
.from_float = NULL,
|
| 600 |
+
.from_float_reference = NULL,
|
| 601 |
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
| 602 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 603 |
},
|
|
|
|
| 18665 |
return (n/QK8_0*sizeof(block_q8_0));
|
| 18666 |
}
|
| 18667 |
|
| 18668 |
+
size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
|
| 18669 |
+
int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
|
| 18670 |
+
(void)imatrix;
|
| 18671 |
size_t result = 0;
|
| 18672 |
+
int n = nrows * n_per_row;
|
| 18673 |
switch (type) {
|
| 18674 |
case GGML_TYPE_Q4_0:
|
| 18675 |
{
|
|
|
|
| 18704 |
case GGML_TYPE_Q2_K:
|
| 18705 |
{
|
| 18706 |
GGML_ASSERT(start % QK_K == 0);
|
| 18707 |
+
GGML_ASSERT(start % n_per_row == 0);
|
| 18708 |
+
size_t start_row = start / n_per_row;
|
| 18709 |
+
size_t row_size = ggml_row_size(type, n_per_row);
|
| 18710 |
+
result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 18711 |
+
GGML_ASSERT(result == row_size * nrows);
|
| 18712 |
} break;
|
| 18713 |
case GGML_TYPE_Q3_K:
|
| 18714 |
{
|
|
|
|
| 18737 |
case GGML_TYPE_IQ2_XXS:
|
| 18738 |
{
|
| 18739 |
GGML_ASSERT(start % QK_K == 0);
|
| 18740 |
+
GGML_ASSERT(start % n_per_row == 0);
|
| 18741 |
+
GGML_ASSERT(imatrix);
|
| 18742 |
+
size_t start_row = start / n_per_row;
|
| 18743 |
+
size_t row_size = ggml_row_size(type, n_per_row);
|
| 18744 |
+
result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 18745 |
+
GGML_ASSERT(result == row_size * nrows);
|
| 18746 |
} break;
|
| 18747 |
case GGML_TYPE_IQ2_XS:
|
| 18748 |
{
|
| 18749 |
GGML_ASSERT(start % QK_K == 0);
|
| 18750 |
+
GGML_ASSERT(start % n_per_row == 0);
|
| 18751 |
+
GGML_ASSERT(imatrix);
|
| 18752 |
+
size_t start_row = start / n_per_row;
|
| 18753 |
+
size_t row_size = ggml_row_size(type, n_per_row);
|
| 18754 |
+
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 18755 |
+
GGML_ASSERT(result == row_size * nrows);
|
| 18756 |
} break;
|
| 18757 |
case GGML_TYPE_F16:
|
| 18758 |
{
|
ggml.h
CHANGED
|
@@ -2067,10 +2067,13 @@ extern "C" {
|
|
| 2067 |
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2068 |
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2069 |
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2070 |
-
GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2071 |
-
GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2072 |
|
| 2073 |
-
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2074 |
|
| 2075 |
//
|
| 2076 |
// Importance matrix
|
|
|
|
| 2067 |
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2068 |
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2069 |
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
|
|
|
|
|
|
| 2070 |
|
| 2071 |
+
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
|
| 2072 |
+
int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
|
| 2073 |
+
|
| 2074 |
+
// These are needed for IQ2_XS and IQ2_XXS quantizations
|
| 2075 |
+
GGML_API void ggml_init_iq2_quantization(enum ggml_type type);
|
| 2076 |
+
GGML_API void ggml_deinit_iq2_quantization(enum ggml_type type);
|
| 2077 |
|
| 2078 |
//
|
| 2079 |
// Importance matrix
|