Spaces:
Running
Running
ggml : remove old quantization functions (llama/5942)
Browse files* ggml : remove old quantization functions
ggml-ci
* ggml : simplify ggml_quantize_chunk
ggml-ci
* ggml : restrict correctness
ggml-ci
* ggml : remove hist data from the quantization API
ggml-ci
* tests : remove hist usage in test-backend-ops
ggml-ci
* vulkan : remove hist and fix typo
- ggml-quants.c +44 -99
- ggml-quants.h +23 -19
- ggml-vulkan.cpp +1 -39
- ggml.c +45 -294
- ggml.h +8 -15
ggml-quants.c
CHANGED
|
@@ -1704,16 +1704,6 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
|
|
| 1704 |
quantize_row_q2_K_reference(x, vy, k);
|
| 1705 |
}
|
| 1706 |
|
| 1707 |
-
size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
| 1708 |
-
(void)hist; // TODO: collect histograms
|
| 1709 |
-
|
| 1710 |
-
for (int j = 0; j < n; j += k) {
|
| 1711 |
-
block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
|
| 1712 |
-
quantize_row_q2_K_reference(src + j, y, k);
|
| 1713 |
-
}
|
| 1714 |
-
return (n/QK_K*sizeof(block_q2_K));
|
| 1715 |
-
}
|
| 1716 |
-
|
| 1717 |
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
| 1718 |
uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
|
| 1719 |
float rmin, float rdelta, int nstep, bool use_mad) {
|
|
@@ -1966,8 +1956,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
| 1966 |
}
|
| 1967 |
}
|
| 1968 |
|
| 1969 |
-
size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 1970 |
-
(void)hist;
|
| 1971 |
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
| 1972 |
if (!quant_weights) {
|
| 1973 |
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
|
@@ -2186,16 +2175,6 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
|
|
| 2186 |
quantize_row_q3_K_reference(x, vy, k);
|
| 2187 |
}
|
| 2188 |
|
| 2189 |
-
size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
| 2190 |
-
(void)hist; // TODO: collect histograms
|
| 2191 |
-
|
| 2192 |
-
for (int j = 0; j < n; j += k) {
|
| 2193 |
-
block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
|
| 2194 |
-
quantize_row_q3_K_reference(src + j, y, k);
|
| 2195 |
-
}
|
| 2196 |
-
return (n/QK_K*sizeof(block_q3_K));
|
| 2197 |
-
}
|
| 2198 |
-
|
| 2199 |
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
|
| 2200 |
#if QK_K != 256
|
| 2201 |
(void)quant_weights;
|
|
@@ -2285,8 +2264,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
|
| 2285 |
#endif
|
| 2286 |
}
|
| 2287 |
|
| 2288 |
-
size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 2289 |
-
(void)hist;
|
| 2290 |
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
| 2291 |
if (!quant_weights) {
|
| 2292 |
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
|
@@ -2456,17 +2434,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
|
|
| 2456 |
quantize_row_q4_K_reference(x, y, k);
|
| 2457 |
}
|
| 2458 |
|
| 2459 |
-
size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
| 2460 |
-
assert(k % QK_K == 0);
|
| 2461 |
-
(void)hist; // TODO: collect histograms
|
| 2462 |
-
|
| 2463 |
-
for (int j = 0; j < n; j += k) {
|
| 2464 |
-
block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
|
| 2465 |
-
quantize_row_q4_K_reference(src + j, y, k);
|
| 2466 |
-
}
|
| 2467 |
-
return (n/QK_K*sizeof(block_q4_K));
|
| 2468 |
-
}
|
| 2469 |
-
|
| 2470 |
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
|
| 2471 |
#if QK_K != 256
|
| 2472 |
(void)quant_weights;
|
|
@@ -2545,8 +2512,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
| 2545 |
#endif
|
| 2546 |
}
|
| 2547 |
|
| 2548 |
-
size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 2549 |
-
(void)hist;
|
| 2550 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
| 2551 |
if (!quant_weights) {
|
| 2552 |
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
|
@@ -2757,17 +2723,6 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
|
|
| 2757 |
quantize_row_q5_K_reference(x, y, k);
|
| 2758 |
}
|
| 2759 |
|
| 2760 |
-
size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
|
| 2761 |
-
assert(k % QK_K == 0);
|
| 2762 |
-
(void)hist; // TODO: collect histograms
|
| 2763 |
-
|
| 2764 |
-
for (int j = 0; j < n; j += k) {
|
| 2765 |
-
block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
|
| 2766 |
-
quantize_row_q5_K_reference(src + j, y, k);
|
| 2767 |
-
}
|
| 2768 |
-
return (n/QK_K*sizeof(block_q5_K));
|
| 2769 |
-
}
|
| 2770 |
-
|
| 2771 |
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
|
| 2772 |
#if QK_K != 256
|
| 2773 |
(void)quant_weights;
|
|
@@ -2866,8 +2821,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
| 2866 |
#endif
|
| 2867 |
}
|
| 2868 |
|
| 2869 |
-
size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 2870 |
-
(void)hist;
|
| 2871 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
| 2872 |
if (!quant_weights) {
|
| 2873 |
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
|
@@ -3020,17 +2974,6 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
|
|
| 3020 |
quantize_row_q6_K_reference(x, y, k);
|
| 3021 |
}
|
| 3022 |
|
| 3023 |
-
size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
|
| 3024 |
-
assert(k % QK_K == 0);
|
| 3025 |
-
(void)hist; // TODO: collect histograms
|
| 3026 |
-
|
| 3027 |
-
for (int j = 0; j < n; j += k) {
|
| 3028 |
-
block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
|
| 3029 |
-
quantize_row_q6_K_reference(src + j, y, k);
|
| 3030 |
-
}
|
| 3031 |
-
return (n/QK_K*sizeof(block_q6_K));
|
| 3032 |
-
}
|
| 3033 |
-
|
| 3034 |
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
|
| 3035 |
#if QK_K != 256
|
| 3036 |
(void)quant_weights;
|
|
@@ -3120,8 +3063,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|
| 3120 |
#endif
|
| 3121 |
}
|
| 3122 |
|
| 3123 |
-
size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 3124 |
-
(void)hist;
|
| 3125 |
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
| 3126 |
if (!quant_weights) {
|
| 3127 |
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
|
@@ -3165,9 +3107,10 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
|
| 3165 |
}
|
| 3166 |
}
|
| 3167 |
|
| 3168 |
-
size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 3169 |
if (!quant_weights) {
|
| 3170 |
-
|
|
|
|
| 3171 |
}
|
| 3172 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
| 3173 |
char * qrow = (char *)dst;
|
|
@@ -3209,9 +3152,10 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
|
| 3209 |
}
|
| 3210 |
}
|
| 3211 |
|
| 3212 |
-
size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 3213 |
if (!quant_weights) {
|
| 3214 |
-
|
|
|
|
| 3215 |
}
|
| 3216 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
| 3217 |
char * qrow = (char *)dst;
|
|
@@ -3262,9 +3206,10 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
|
| 3262 |
}
|
| 3263 |
}
|
| 3264 |
|
| 3265 |
-
size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 3266 |
if (!quant_weights) {
|
| 3267 |
-
|
|
|
|
| 3268 |
}
|
| 3269 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
| 3270 |
char * qrow = (char *)dst;
|
|
@@ -3314,9 +3259,10 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
|
| 3314 |
}
|
| 3315 |
}
|
| 3316 |
|
| 3317 |
-
size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 3318 |
if (!quant_weights) {
|
| 3319 |
-
|
|
|
|
| 3320 |
}
|
| 3321 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
| 3322 |
char * qrow = (char *)dst;
|
|
@@ -3328,6 +3274,13 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int
|
|
| 3328 |
return nrow * row_size;
|
| 3329 |
}
|
| 3330 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3331 |
// ====================== "True" 2-bit (de)-quantization
|
| 3332 |
|
| 3333 |
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
|
|
@@ -9373,7 +9326,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
|
|
| 9373 |
#endif
|
| 9374 |
}
|
| 9375 |
|
| 9376 |
-
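void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {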
|
| 9377 |
assert(n % QK_K == 0);
|
| 9378 |
assert(nrc == 1);
|
| 9379 |
UNUSED(nrc);
|
|
@@ -9621,7 +9574,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
|
| 9621 |
}
|
| 9622 |
#endif
|
| 9623 |
|
| 9624 |
-
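void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {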
|
| 9625 |
assert(n % QK_K == 0);
|
| 9626 |
assert(nrc == 1);
|
| 9627 |
UNUSED(nrc);
|
|
@@ -10221,7 +10174,7 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
| 10221 |
int * kmap_q2xs;
|
| 10222 |
uint16_t * kneighbors_q2xs;
|
| 10223 |
|
| 10224 |
-
printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
| 10225 |
uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
|
| 10226 |
for (int k = 0; k < grid_size; ++k) {
|
| 10227 |
int8_t * pos = (int8_t *)(the_grid + k);
|
|
@@ -10276,7 +10229,7 @@ void iq2xs_init_impl(enum ggml_type type) {
|
|
| 10276 |
}
|
| 10277 |
num_neighbors += n;
|
| 10278 |
}
|
| 10279 |
-
printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
| 10280 |
kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
| 10281 |
iq2_data[gindex].neighbours = kneighbors_q2xs;
|
| 10282 |
int counter = 0;
|
|
@@ -10699,8 +10652,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
|
|
| 10699 |
}
|
| 10700 |
}
|
| 10701 |
|
| 10702 |
-
size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 10703 |
-
(void)hist;
|
| 10704 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 10705 |
int nblock = n_per_row/QK_K;
|
| 10706 |
char * qrow = (char *)dst;
|
|
@@ -10712,8 +10664,7 @@ size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row,
|
|
| 10712 |
return nrow * nblock * sizeof(block_iq2_xxs);
|
| 10713 |
}
|
| 10714 |
|
| 10715 |
-
size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 10716 |
-
(void)hist;
|
| 10717 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 10718 |
int nblock = n_per_row/QK_K;
|
| 10719 |
char * qrow = (char *)dst;
|
|
@@ -10817,7 +10768,7 @@ void iq3xs_init_impl(int grid_size) {
|
|
| 10817 |
int * kmap_q3xs;
|
| 10818 |
uint16_t * kneighbors_q3xs;
|
| 10819 |
|
| 10820 |
-
printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
| 10821 |
uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
|
| 10822 |
for (int k = 0; k < grid_size; ++k) {
|
| 10823 |
int8_t * pos = (int8_t *)(the_grid + k);
|
|
@@ -10872,7 +10823,7 @@ void iq3xs_init_impl(int grid_size) {
|
|
| 10872 |
}
|
| 10873 |
num_neighbors += n;
|
| 10874 |
}
|
| 10875 |
-
printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
| 10876 |
kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
| 10877 |
iq3_data[gindex].neighbours = kneighbors_q3xs;
|
| 10878 |
int counter = 0;
|
|
@@ -11155,8 +11106,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
|
|
| 11155 |
}
|
| 11156 |
}
|
| 11157 |
|
| 11158 |
-
size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 11159 |
-
(void)hist;
|
| 11160 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11161 |
int nblock = n_per_row/QK_K;
|
| 11162 |
char * qrow = (char *)dst;
|
|
@@ -11362,8 +11312,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
|
|
| 11362 |
}
|
| 11363 |
|
| 11364 |
#define IQ3S_BLOCK_SIZE 32
|
| 11365 |
-
size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 11366 |
-
(void)hist;
|
| 11367 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11368 |
int nblock = n_per_row/QK_K;
|
| 11369 |
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
|
@@ -11393,7 +11342,7 @@ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
|
|
| 11393 |
|
| 11394 |
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
|
| 11395 |
assert(k % QK_K == 0);
|
| 11396 |
-
quantize_iq3_s(x, y, 1, k, NULL, NULL);
|
| 11397 |
}
|
| 11398 |
|
| 11399 |
|
|
@@ -11588,8 +11537,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
|
|
| 11588 |
}
|
| 11589 |
}
|
| 11590 |
|
| 11591 |
-
size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 11592 |
-
(void)hist;
|
| 11593 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11594 |
int nblock = n_per_row/QK_K;
|
| 11595 |
char * qrow = (char *)dst;
|
|
@@ -11614,7 +11562,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
|
|
| 11614 |
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
| 11615 |
}
|
| 11616 |
|
| 11617 |
-
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
|
| 11618 |
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
|
| 11619 |
float * scales, float * weight, uint8_t * L,
|
| 11620 |
const int8_t * values,
|
|
@@ -11722,8 +11670,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
|
|
| 11722 |
}
|
| 11723 |
}
|
| 11724 |
|
| 11725 |
-
size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 11726 |
-
(void)hist;
|
| 11727 |
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
| 11728 |
int nblock = n_per_row/QK4_NL;
|
| 11729 |
char * qrow = (char *)dst;
|
|
@@ -11753,14 +11700,13 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
|
|
| 11753 |
|
| 11754 |
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
|
| 11755 |
assert(k % QK4_NL == 0);
|
| 11756 |
-
quantize_iq4_nl(x, y, 1, k, NULL, NULL);
|
| 11757 |
}
|
| 11758 |
|
| 11759 |
-
size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 11760 |
#if QK_K == 64
|
| 11761 |
-
return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
|
| 11762 |
#else
|
| 11763 |
-
(void)hist;
|
| 11764 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11765 |
int nblock = n_per_row/QK_K;
|
| 11766 |
char * qrow = (char *)dst;
|
|
@@ -11789,7 +11735,7 @@ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
|
|
| 11789 |
|
| 11790 |
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
|
| 11791 |
assert(k % QK_K == 0);
|
| 11792 |
-
quantize_iq4_xs(x, y, 1, k, NULL, NULL);
|
| 11793 |
}
|
| 11794 |
|
| 11795 |
// =============================== 2.5625 bpw
|
|
@@ -11962,8 +11908,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
|
|
| 11962 |
}
|
| 11963 |
}
|
| 11964 |
|
| 11965 |
-
size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
|
| 11966 |
-
(void)hist;
|
| 11967 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11968 |
int nblock = n_per_row/QK_K;
|
| 11969 |
char * qrow = (char *)dst;
|
|
@@ -11977,7 +11922,7 @@ size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, in
|
|
| 11977 |
|
| 11978 |
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
|
| 11979 |
assert(k % QK_K == 0);
|
| 11980 |
-
quantize_iq2_s(x, y, 1, k, NULL, NULL);
|
| 11981 |
}
|
| 11982 |
|
| 11983 |
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
|
|
|
|
| 1704 |
quantize_row_q2_K_reference(x, vy, k);
|
| 1705 |
}
|
| 1706 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1707 |
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
| 1708 |
uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
|
| 1709 |
float rmin, float rdelta, int nstep, bool use_mad) {
|
|
|
|
| 1956 |
}
|
| 1957 |
}
|
| 1958 |
|
| 1959 |
+
size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
|
|
|
| 1960 |
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
| 1961 |
if (!quant_weights) {
|
| 1962 |
quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
|
|
|
|
| 2175 |
quantize_row_q3_K_reference(x, vy, k);
|
| 2176 |
}
|
| 2177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2178 |
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
|
| 2179 |
#if QK_K != 256
|
| 2180 |
(void)quant_weights;
|
|
|
|
| 2264 |
#endif
|
| 2265 |
}
|
| 2266 |
|
| 2267 |
+
size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
|
|
|
| 2268 |
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
| 2269 |
if (!quant_weights) {
|
| 2270 |
quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
|
|
|
|
| 2434 |
quantize_row_q4_K_reference(x, y, k);
|
| 2435 |
}
|
| 2436 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2437 |
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
|
| 2438 |
#if QK_K != 256
|
| 2439 |
(void)quant_weights;
|
|
|
|
| 2512 |
#endif
|
| 2513 |
}
|
| 2514 |
|
| 2515 |
+
size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
|
|
|
| 2516 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
| 2517 |
if (!quant_weights) {
|
| 2518 |
quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
|
|
|
|
| 2723 |
quantize_row_q5_K_reference(x, y, k);
|
| 2724 |
}
|
| 2725 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2726 |
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
|
| 2727 |
#if QK_K != 256
|
| 2728 |
(void)quant_weights;
|
|
|
|
| 2821 |
#endif
|
| 2822 |
}
|
| 2823 |
|
| 2824 |
+
size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
|
|
|
| 2825 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
| 2826 |
if (!quant_weights) {
|
| 2827 |
quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
|
|
|
|
| 2974 |
quantize_row_q6_K_reference(x, y, k);
|
| 2975 |
}
|
| 2976 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2977 |
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
|
| 2978 |
#if QK_K != 256
|
| 2979 |
(void)quant_weights;
|
|
|
|
| 3063 |
#endif
|
| 3064 |
}
|
| 3065 |
|
| 3066 |
+
size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
|
|
|
| 3067 |
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
| 3068 |
if (!quant_weights) {
|
| 3069 |
quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
|
|
|
|
| 3107 |
}
|
| 3108 |
}
|
| 3109 |
|
| 3110 |
+
size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
| 3111 |
if (!quant_weights) {
|
| 3112 |
+
quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
|
| 3113 |
+
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
| 3114 |
}
|
| 3115 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
| 3116 |
char * qrow = (char *)dst;
|
|
|
|
| 3152 |
}
|
| 3153 |
}
|
| 3154 |
|
| 3155 |
+
size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
| 3156 |
if (!quant_weights) {
|
| 3157 |
+
quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
|
| 3158 |
+
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
| 3159 |
}
|
| 3160 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
| 3161 |
char * qrow = (char *)dst;
|
|
|
|
| 3206 |
}
|
| 3207 |
}
|
| 3208 |
|
| 3209 |
+
size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
| 3210 |
if (!quant_weights) {
|
| 3211 |
+
quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
|
| 3212 |
+
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
| 3213 |
}
|
| 3214 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
| 3215 |
char * qrow = (char *)dst;
|
|
|
|
| 3259 |
}
|
| 3260 |
}
|
| 3261 |
|
| 3262 |
+
size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
| 3263 |
if (!quant_weights) {
|
| 3264 |
+
quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
|
| 3265 |
+
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
| 3266 |
}
|
| 3267 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
| 3268 |
char * qrow = (char *)dst;
|
|
|
|
| 3274 |
return nrow * row_size;
|
| 3275 |
}
|
| 3276 |
|
| 3277 |
+
size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
| 3278 |
+
(void)quant_weights; // not used
|
| 3279 |
+
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
| 3280 |
+
quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
|
| 3281 |
+
return nrow * row_size;
|
| 3282 |
+
}
|
| 3283 |
+
|
| 3284 |
// ====================== "True" 2-bit (de)-quantization
|
| 3285 |
|
| 3286 |
void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
|
|
|
|
| 9326 |
#endif
|
| 9327 |
}
|
| 9328 |
|
| 9329 |
+
void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
| 9330 |
assert(n % QK_K == 0);
|
| 9331 |
assert(nrc == 1);
|
| 9332 |
UNUSED(nrc);
|
|
|
|
| 9574 |
}
|
| 9575 |
#endif
|
| 9576 |
|
| 9577 |
+
void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
|
| 9578 |
assert(n % QK_K == 0);
|
| 9579 |
assert(nrc == 1);
|
| 9580 |
UNUSED(nrc);
|
|
|
|
| 10174 |
int * kmap_q2xs;
|
| 10175 |
uint16_t * kneighbors_q2xs;
|
| 10176 |
|
| 10177 |
+
//printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
| 10178 |
uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
|
| 10179 |
for (int k = 0; k < grid_size; ++k) {
|
| 10180 |
int8_t * pos = (int8_t *)(the_grid + k);
|
|
|
|
| 10229 |
}
|
| 10230 |
num_neighbors += n;
|
| 10231 |
}
|
| 10232 |
+
//printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
| 10233 |
kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
| 10234 |
iq2_data[gindex].neighbours = kneighbors_q2xs;
|
| 10235 |
int counter = 0;
|
|
|
|
| 10652 |
}
|
| 10653 |
}
|
| 10654 |
|
| 10655 |
+
size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
|
|
|
| 10656 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 10657 |
int nblock = n_per_row/QK_K;
|
| 10658 |
char * qrow = (char *)dst;
|
|
|
|
| 10664 |
return nrow * nblock * sizeof(block_iq2_xxs);
|
| 10665 |
}
|
| 10666 |
|
| 10667 |
+
size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
|
|
|
| 10668 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 10669 |
int nblock = n_per_row/QK_K;
|
| 10670 |
char * qrow = (char *)dst;
|
|
|
|
| 10768 |
int * kmap_q3xs;
|
| 10769 |
uint16_t * kneighbors_q3xs;
|
| 10770 |
|
| 10771 |
+
//printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
|
| 10772 |
uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
|
| 10773 |
for (int k = 0; k < grid_size; ++k) {
|
| 10774 |
int8_t * pos = (int8_t *)(the_grid + k);
|
|
|
|
| 10823 |
}
|
| 10824 |
num_neighbors += n;
|
| 10825 |
}
|
| 10826 |
+
//printf("%s: %d neighbours in total\n", __func__, num_neighbors);
|
| 10827 |
kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
|
| 10828 |
iq3_data[gindex].neighbours = kneighbors_q3xs;
|
| 10829 |
int counter = 0;
|
|
|
|
| 11106 |
}
|
| 11107 |
}
|
| 11108 |
|
| 11109 |
+
size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
|
|
|
| 11110 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11111 |
int nblock = n_per_row/QK_K;
|
| 11112 |
char * qrow = (char *)dst;
|
|
|
|
| 11312 |
}
|
| 11313 |
|
| 11314 |
#define IQ3S_BLOCK_SIZE 32
|
| 11315 |
+
size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
|
|
|
| 11316 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11317 |
int nblock = n_per_row/QK_K;
|
| 11318 |
float scales[QK_K/IQ3S_BLOCK_SIZE];
|
|
|
|
| 11342 |
|
| 11343 |
void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
|
| 11344 |
assert(k % QK_K == 0);
|
| 11345 |
+
quantize_iq3_s(x, y, 1, k, NULL);
|
| 11346 |
}
|
| 11347 |
|
| 11348 |
|
|
|
|
| 11537 |
}
|
| 11538 |
}
|
| 11539 |
|
| 11540 |
+
size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
|
|
|
| 11541 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11542 |
int nblock = n_per_row/QK_K;
|
| 11543 |
char * qrow = (char *)dst;
|
|
|
|
| 11562 |
return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
|
| 11563 |
}
|
| 11564 |
|
| 11565 |
+
static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
|
| 11566 |
ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
|
| 11567 |
float * scales, float * weight, uint8_t * L,
|
| 11568 |
const int8_t * values,
|
|
|
|
| 11670 |
}
|
| 11671 |
}
|
| 11672 |
|
| 11673 |
+
size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
|
|
|
| 11674 |
GGML_ASSERT(n_per_row%QK4_NL == 0);
|
| 11675 |
int nblock = n_per_row/QK4_NL;
|
| 11676 |
char * qrow = (char *)dst;
|
|
|
|
| 11700 |
|
| 11701 |
void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
|
| 11702 |
assert(k % QK4_NL == 0);
|
| 11703 |
+
quantize_iq4_nl(x, y, 1, k, NULL);
|
| 11704 |
}
|
| 11705 |
|
| 11706 |
+
size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
| 11707 |
#if QK_K == 64
|
| 11708 |
+
return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
|
| 11709 |
#else
|
|
|
|
| 11710 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11711 |
int nblock = n_per_row/QK_K;
|
| 11712 |
char * qrow = (char *)dst;
|
|
|
|
| 11735 |
|
| 11736 |
void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
|
| 11737 |
assert(k % QK_K == 0);
|
| 11738 |
+
quantize_iq4_xs(x, y, 1, k, NULL);
|
| 11739 |
}
|
| 11740 |
|
| 11741 |
// =============================== 2.5625 bpw
|
|
|
|
| 11908 |
}
|
| 11909 |
}
|
| 11910 |
|
| 11911 |
+
size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
|
|
|
|
| 11912 |
GGML_ASSERT(n_per_row%QK_K == 0);
|
| 11913 |
int nblock = n_per_row/QK_K;
|
| 11914 |
char * qrow = (char *)dst;
|
|
|
|
| 11922 |
|
| 11923 |
void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
|
| 11924 |
assert(k % QK_K == 0);
|
| 11925 |
+
quantize_iq2_s(x, y, 1, k, NULL);
|
| 11926 |
}
|
| 11927 |
|
| 11928 |
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
|
ggml-quants.h
CHANGED
|
@@ -261,6 +261,7 @@ void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGM
|
|
| 261 |
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
|
| 262 |
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
| 263 |
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
|
|
|
| 264 |
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
| 265 |
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
| 266 |
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
|
|
@@ -280,6 +281,7 @@ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
|
|
| 280 |
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 281 |
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 282 |
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
| 283 |
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 284 |
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 285 |
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
@@ -300,6 +302,7 @@ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRI
|
|
| 300 |
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 301 |
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 302 |
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
| 303 |
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 304 |
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 305 |
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
@@ -321,6 +324,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
|
|
| 321 |
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 322 |
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 323 |
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
|
|
| 324 |
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 325 |
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 326 |
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
@@ -330,26 +334,26 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const
|
|
| 330 |
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 331 |
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 332 |
|
| 333 |
-
//
|
| 334 |
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
| 335 |
-
|
| 336 |
-
size_t
|
| 337 |
-
size_t
|
| 338 |
-
size_t
|
| 339 |
-
size_t
|
| 340 |
-
size_t
|
| 341 |
-
size_t
|
| 342 |
-
size_t
|
| 343 |
-
|
| 344 |
-
size_t quantize_q2_K
|
| 345 |
-
size_t quantize_q3_K
|
| 346 |
-
size_t quantize_q4_K
|
| 347 |
-
size_t quantize_q5_K
|
| 348 |
-
size_t quantize_q6_K
|
| 349 |
-
size_t quantize_q4_0
|
| 350 |
-
size_t quantize_q4_1
|
| 351 |
-
size_t quantize_q5_0
|
| 352 |
-
size_t quantize_q5_1
|
|
|
|
| 353 |
|
| 354 |
void iq2xs_init_impl(enum ggml_type type);
|
| 355 |
void iq2xs_free_impl(enum ggml_type type);
|
|
|
|
| 261 |
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
|
| 262 |
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
|
| 263 |
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
|
| 264 |
+
|
| 265 |
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
|
| 266 |
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
|
| 267 |
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
|
|
|
|
| 281 |
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 282 |
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 283 |
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 284 |
+
|
| 285 |
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 286 |
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
| 287 |
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
|
|
|
|
| 302 |
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 303 |
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 304 |
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 305 |
+
|
| 306 |
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 307 |
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
| 308 |
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
|
|
|
|
| 324 |
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 325 |
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 326 |
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 327 |
+
|
| 328 |
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 329 |
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 330 |
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
|
|
|
| 334 |
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 335 |
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
| 336 |
|
|
|
|
| 337 |
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
| 338 |
+
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 339 |
+
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 340 |
+
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 341 |
+
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 342 |
+
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 343 |
+
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 344 |
+
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 345 |
+
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 346 |
+
|
| 347 |
+
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 348 |
+
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 349 |
+
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 350 |
+
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 351 |
+
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 352 |
+
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 353 |
+
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 354 |
+
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 355 |
+
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 356 |
+
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
|
| 357 |
|
| 358 |
void iq2xs_init_impl(enum ggml_type type);
|
| 359 |
void iq2xs_free_impl(enum ggml_type type);
|
ggml-vulkan.cpp
CHANGED
|
@@ -4102,45 +4102,7 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool
|
|
| 4102 |
}
|
| 4103 |
|
| 4104 |
static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
|
| 4105 |
-
|
| 4106 |
-
|
| 4107 |
-
switch(quant) {
|
| 4108 |
-
case GGML_TYPE_F32:
|
| 4109 |
-
memcpy(to, from, sizeof(float) * ne);
|
| 4110 |
-
break;
|
| 4111 |
-
case GGML_TYPE_Q4_0:
|
| 4112 |
-
ggml_quantize_q4_0(from, to, ne, ne, hist_cur.data());
|
| 4113 |
-
break;
|
| 4114 |
-
case GGML_TYPE_Q4_1:
|
| 4115 |
-
ggml_quantize_q4_1(from, to, ne, ne, hist_cur.data());
|
| 4116 |
-
break;
|
| 4117 |
-
case GGML_TYPE_Q5_0:
|
| 4118 |
-
ggml_quantize_q5_0(from, to, ne, ne, hist_cur.data());
|
| 4119 |
-
break;
|
| 4120 |
-
case GGML_TYPE_Q5_1:
|
| 4121 |
-
ggml_quantize_q5_1(from, to, ne, ne, hist_cur.data());
|
| 4122 |
-
break;
|
| 4123 |
-
case GGML_TYPE_Q8_0:
|
| 4124 |
-
ggml_quantize_q8_0(from, to, ne, ne, hist_cur.data());
|
| 4125 |
-
break;
|
| 4126 |
-
case GGML_TYPE_Q2_K:
|
| 4127 |
-
ggml_quantize_q2_K(from, to, ne, ne, hist_cur.data());
|
| 4128 |
-
break;
|
| 4129 |
-
case GGML_TYPE_Q3_K:
|
| 4130 |
-
ggml_quantize_q3_K(from, to, ne, ne, hist_cur.data());
|
| 4131 |
-
break;
|
| 4132 |
-
case GGML_TYPE_Q4_K:
|
| 4133 |
-
ggml_quantize_q4_K(from, to, ne, ne, hist_cur.data());
|
| 4134 |
-
break;
|
| 4135 |
-
case GGML_TYPE_Q5_K:
|
| 4136 |
-
ggml_quantize_q5_K(from, to, ne, ne, hist_cur.data());
|
| 4137 |
-
break;
|
| 4138 |
-
case GGML_TYPE_Q6_K:
|
| 4139 |
-
ggml_quantize_q6_K(from, to, ne, ne, hist_cur.data());
|
| 4140 |
-
break;
|
| 4141 |
-
default:
|
| 4142 |
-
GGML_ASSERT(false);
|
| 4143 |
-
}
|
| 4144 |
}
|
| 4145 |
|
| 4146 |
static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
|
|
|
|
| 4102 |
}
|
| 4103 |
|
| 4104 |
static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
|
| 4105 |
+
ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4106 |
}
|
| 4107 |
|
| 4108 |
static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
|
ggml.c
CHANGED
|
@@ -20159,133 +20159,6 @@ void ggml_quantize_free(void) {
|
|
| 20159 |
ggml_critical_section_end();
|
| 20160 |
}
|
| 20161 |
|
| 20162 |
-
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
| 20163 |
-
assert(k % QK4_0 == 0);
|
| 20164 |
-
const int nb = k / QK4_0;
|
| 20165 |
-
|
| 20166 |
-
for (int b = 0; b < n; b += k) {
|
| 20167 |
-
block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
|
| 20168 |
-
|
| 20169 |
-
quantize_row_q4_0_reference(src + b, y, k);
|
| 20170 |
-
|
| 20171 |
-
for (int i = 0; i < nb; i++) {
|
| 20172 |
-
for (int j = 0; j < QK4_0; j += 2) {
|
| 20173 |
-
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
|
| 20174 |
-
const uint8_t vi1 = y[i].qs[j/2] >> 4;
|
| 20175 |
-
|
| 20176 |
-
hist[vi0]++;
|
| 20177 |
-
hist[vi1]++;
|
| 20178 |
-
}
|
| 20179 |
-
}
|
| 20180 |
-
}
|
| 20181 |
-
|
| 20182 |
-
return (n/QK4_0*sizeof(block_q4_0));
|
| 20183 |
-
}
|
| 20184 |
-
|
| 20185 |
-
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
|
| 20186 |
-
assert(k % QK4_1 == 0);
|
| 20187 |
-
const int nb = k / QK4_1;
|
| 20188 |
-
|
| 20189 |
-
for (int b = 0; b < n; b += k) {
|
| 20190 |
-
block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
|
| 20191 |
-
|
| 20192 |
-
quantize_row_q4_1_reference(src + b, y, k);
|
| 20193 |
-
|
| 20194 |
-
for (int i = 0; i < nb; i++) {
|
| 20195 |
-
for (int j = 0; j < QK4_1; j += 2) {
|
| 20196 |
-
const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
|
| 20197 |
-
const uint8_t vi1 = y[i].qs[j/2] >> 4;
|
| 20198 |
-
|
| 20199 |
-
hist[vi0]++;
|
| 20200 |
-
hist[vi1]++;
|
| 20201 |
-
}
|
| 20202 |
-
}
|
| 20203 |
-
}
|
| 20204 |
-
|
| 20205 |
-
return (n/QK4_1*sizeof(block_q4_1));
|
| 20206 |
-
}
|
| 20207 |
-
|
| 20208 |
-
size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
| 20209 |
-
assert(k % QK5_0 == 0);
|
| 20210 |
-
const int nb = k / QK5_0;
|
| 20211 |
-
|
| 20212 |
-
for (int b = 0; b < n; b += k) {
|
| 20213 |
-
block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
|
| 20214 |
-
|
| 20215 |
-
quantize_row_q5_0_reference(src + b, y, k);
|
| 20216 |
-
|
| 20217 |
-
for (int i = 0; i < nb; i++) {
|
| 20218 |
-
uint32_t qh;
|
| 20219 |
-
memcpy(&qh, &y[i].qh, sizeof(qh));
|
| 20220 |
-
|
| 20221 |
-
for (int j = 0; j < QK5_0; j += 2) {
|
| 20222 |
-
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
| 20223 |
-
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
| 20224 |
-
|
| 20225 |
-
// cast to 16 bins
|
| 20226 |
-
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
| 20227 |
-
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
|
| 20228 |
-
|
| 20229 |
-
hist[vi0]++;
|
| 20230 |
-
hist[vi1]++;
|
| 20231 |
-
}
|
| 20232 |
-
}
|
| 20233 |
-
}
|
| 20234 |
-
|
| 20235 |
-
return (n/QK5_0*sizeof(block_q5_0));
|
| 20236 |
-
}
|
| 20237 |
-
|
| 20238 |
-
size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
|
| 20239 |
-
assert(k % QK5_1 == 0);
|
| 20240 |
-
const int nb = k / QK5_1;
|
| 20241 |
-
|
| 20242 |
-
for (int b = 0; b < n; b += k) {
|
| 20243 |
-
block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
|
| 20244 |
-
|
| 20245 |
-
quantize_row_q5_1_reference(src + b, y, k);
|
| 20246 |
-
|
| 20247 |
-
for (int i = 0; i < nb; i++) {
|
| 20248 |
-
uint32_t qh;
|
| 20249 |
-
memcpy(&qh, &y[i].qh, sizeof(qh));
|
| 20250 |
-
|
| 20251 |
-
for (int j = 0; j < QK5_1; j += 2) {
|
| 20252 |
-
const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
|
| 20253 |
-
const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
|
| 20254 |
-
|
| 20255 |
-
// cast to 16 bins
|
| 20256 |
-
const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
|
| 20257 |
-
const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
|
| 20258 |
-
|
| 20259 |
-
hist[vi0]++;
|
| 20260 |
-
hist[vi1]++;
|
| 20261 |
-
}
|
| 20262 |
-
}
|
| 20263 |
-
}
|
| 20264 |
-
|
| 20265 |
-
return (n/QK5_1*sizeof(block_q5_1));
|
| 20266 |
-
}
|
| 20267 |
-
|
| 20268 |
-
size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
|
| 20269 |
-
assert(k % QK8_0 == 0);
|
| 20270 |
-
const int nb = k / QK8_0;
|
| 20271 |
-
|
| 20272 |
-
for (int b = 0; b < n; b += k) {
|
| 20273 |
-
block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
|
| 20274 |
-
|
| 20275 |
-
quantize_row_q8_0_reference(src + b, y, k);
|
| 20276 |
-
|
| 20277 |
-
for (int i = 0; i < nb; i++) {
|
| 20278 |
-
for (int j = 0; j < QK8_0; ++j) {
|
| 20279 |
-
const int8_t vi = y[i].qs[j];
|
| 20280 |
-
|
| 20281 |
-
hist[vi/16 + 8]++;
|
| 20282 |
-
}
|
| 20283 |
-
}
|
| 20284 |
-
}
|
| 20285 |
-
|
| 20286 |
-
return (n/QK8_0*sizeof(block_q8_0));
|
| 20287 |
-
}
|
| 20288 |
-
|
| 20289 |
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
| 20290 |
return
|
| 20291 |
type == GGML_TYPE_IQ2_XXS ||
|
|
@@ -20293,177 +20166,52 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
|
| 20293 |
type == GGML_TYPE_IQ1_S;
|
| 20294 |
}
|
| 20295 |
|
| 20296 |
-
size_t ggml_quantize_chunk(
|
| 20297 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20298 |
ggml_quantize_init(type); // this is noop if already initialized
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20299 |
size_t result = 0;
|
| 20300 |
-
|
| 20301 |
switch (type) {
|
| 20302 |
-
case GGML_TYPE_Q4_0:
|
| 20303 |
-
|
| 20304 |
-
|
| 20305 |
-
|
| 20306 |
-
|
| 20307 |
-
|
| 20308 |
-
|
| 20309 |
-
|
| 20310 |
-
|
| 20311 |
-
case
|
| 20312 |
-
|
| 20313 |
-
|
| 20314 |
-
|
| 20315 |
-
|
| 20316 |
-
|
| 20317 |
-
|
| 20318 |
-
|
| 20319 |
-
} break;
|
| 20320 |
-
case GGML_TYPE_Q5_0:
|
| 20321 |
-
{
|
| 20322 |
-
GGML_ASSERT(start % QK5_0 == 0);
|
| 20323 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20324 |
-
size_t start_row = start / n_per_row;
|
| 20325 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20326 |
-
result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20327 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20328 |
-
} break;
|
| 20329 |
-
case GGML_TYPE_Q5_1:
|
| 20330 |
-
{
|
| 20331 |
-
GGML_ASSERT(start % QK5_1 == 0);
|
| 20332 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20333 |
-
size_t start_row = start / n_per_row;
|
| 20334 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20335 |
-
result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20336 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20337 |
-
} break;
|
| 20338 |
-
case GGML_TYPE_Q8_0:
|
| 20339 |
-
{
|
| 20340 |
-
GGML_ASSERT(start % QK8_0 == 0);
|
| 20341 |
-
block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
|
| 20342 |
-
result = ggml_quantize_q8_0(src + start, block, n, n, hist);
|
| 20343 |
-
} break;
|
| 20344 |
-
case GGML_TYPE_Q2_K:
|
| 20345 |
-
{
|
| 20346 |
-
GGML_ASSERT(start % QK_K == 0);
|
| 20347 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20348 |
-
size_t start_row = start / n_per_row;
|
| 20349 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20350 |
-
result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20351 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20352 |
-
} break;
|
| 20353 |
-
case GGML_TYPE_Q3_K:
|
| 20354 |
-
{
|
| 20355 |
-
GGML_ASSERT(start % QK_K == 0);
|
| 20356 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20357 |
-
size_t start_row = start / n_per_row;
|
| 20358 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20359 |
-
result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20360 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20361 |
-
} break;
|
| 20362 |
-
case GGML_TYPE_Q4_K:
|
| 20363 |
-
{
|
| 20364 |
-
GGML_ASSERT(start % QK_K == 0);
|
| 20365 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20366 |
-
size_t start_row = start / n_per_row;
|
| 20367 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20368 |
-
result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20369 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20370 |
-
} break;
|
| 20371 |
-
case GGML_TYPE_Q5_K:
|
| 20372 |
-
{
|
| 20373 |
-
GGML_ASSERT(start % QK_K == 0);
|
| 20374 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20375 |
-
size_t start_row = start / n_per_row;
|
| 20376 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20377 |
-
result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20378 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20379 |
-
} break;
|
| 20380 |
-
case GGML_TYPE_Q6_K:
|
| 20381 |
-
{
|
| 20382 |
-
GGML_ASSERT(start % QK_K == 0);
|
| 20383 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20384 |
-
size_t start_row = start / n_per_row;
|
| 20385 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20386 |
-
result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20387 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20388 |
-
} break;
|
| 20389 |
-
case GGML_TYPE_IQ2_XXS:
|
| 20390 |
-
{
|
| 20391 |
-
GGML_ASSERT(start % QK_K == 0);
|
| 20392 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20393 |
-
GGML_ASSERT(imatrix);
|
| 20394 |
-
size_t start_row = start / n_per_row;
|
| 20395 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20396 |
-
result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20397 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20398 |
-
} break;
|
| 20399 |
-
case GGML_TYPE_IQ2_XS:
|
| 20400 |
-
{
|
| 20401 |
-
GGML_ASSERT(start % QK_K == 0);
|
| 20402 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20403 |
-
GGML_ASSERT(imatrix);
|
| 20404 |
-
size_t start_row = start / n_per_row;
|
| 20405 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20406 |
-
result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20407 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20408 |
-
} break;
|
| 20409 |
-
case GGML_TYPE_IQ3_XXS:
|
| 20410 |
-
{
|
| 20411 |
-
GGML_ASSERT(start % QK_K == 0);
|
| 20412 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20413 |
-
size_t start_row = start / n_per_row;
|
| 20414 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20415 |
-
result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20416 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20417 |
-
} break;
|
| 20418 |
-
case GGML_TYPE_IQ3_S:
|
| 20419 |
-
{
|
| 20420 |
-
GGML_ASSERT(start % QK_K == 0);
|
| 20421 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20422 |
-
size_t start_row = start / n_per_row;
|
| 20423 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20424 |
-
result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20425 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20426 |
-
} break;
|
| 20427 |
-
case GGML_TYPE_IQ2_S:
|
| 20428 |
-
{
|
| 20429 |
-
GGML_ASSERT(start % QK_K == 0);
|
| 20430 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20431 |
-
size_t start_row = start / n_per_row;
|
| 20432 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20433 |
-
result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20434 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20435 |
-
} break;
|
| 20436 |
-
case GGML_TYPE_IQ1_S:
|
| 20437 |
-
{
|
| 20438 |
-
GGML_ASSERT(start % QK_K == 0);
|
| 20439 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20440 |
-
size_t start_row = start / n_per_row;
|
| 20441 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20442 |
-
result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20443 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20444 |
-
} break;
|
| 20445 |
-
case GGML_TYPE_IQ4_NL:
|
| 20446 |
#if QK_K == 64
|
| 20447 |
-
case GGML_TYPE_IQ4_XS:
|
| 20448 |
-
#
|
| 20449 |
-
|
| 20450 |
-
GGML_ASSERT(start % QK4_NL == 0);
|
| 20451 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20452 |
-
size_t start_row = start / n_per_row;
|
| 20453 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20454 |
-
result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20455 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20456 |
-
} break;
|
| 20457 |
-
#if QK_K != 64
|
| 20458 |
-
case GGML_TYPE_IQ4_XS:
|
| 20459 |
-
{
|
| 20460 |
-
GGML_ASSERT(start % QK_K == 0);
|
| 20461 |
-
GGML_ASSERT(start % n_per_row == 0);
|
| 20462 |
-
size_t start_row = start / n_per_row;
|
| 20463 |
-
size_t row_size = ggml_row_size(type, n_per_row);
|
| 20464 |
-
result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
|
| 20465 |
-
GGML_ASSERT(result == row_size * nrows);
|
| 20466 |
-
} break;
|
| 20467 |
#endif
|
| 20468 |
case GGML_TYPE_F16:
|
| 20469 |
{
|
|
@@ -20480,6 +20228,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
|
|
| 20480 |
default:
|
| 20481 |
assert(false);
|
| 20482 |
}
|
|
|
|
|
|
|
|
|
|
| 20483 |
return result;
|
| 20484 |
}
|
| 20485 |
|
|
|
|
| 20159 |
ggml_critical_section_end();
|
| 20160 |
}
|
| 20161 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20162 |
bool ggml_quantize_requires_imatrix(enum ggml_type type) {
|
| 20163 |
return
|
| 20164 |
type == GGML_TYPE_IQ2_XXS ||
|
|
|
|
| 20166 |
type == GGML_TYPE_IQ1_S;
|
| 20167 |
}
|
| 20168 |
|
| 20169 |
+
size_t ggml_quantize_chunk(
|
| 20170 |
+
enum ggml_type type,
|
| 20171 |
+
const float * src,
|
| 20172 |
+
void * dst,
|
| 20173 |
+
int start,
|
| 20174 |
+
int nrows,
|
| 20175 |
+
int n_per_row,
|
| 20176 |
+
const float * imatrix) {
|
| 20177 |
+
const int n = nrows * n_per_row;
|
| 20178 |
+
|
| 20179 |
+
if (ggml_quantize_requires_imatrix(type)) {
|
| 20180 |
+
GGML_ASSERT(imatrix != NULL);
|
| 20181 |
+
}
|
| 20182 |
+
|
| 20183 |
+
GGML_ASSERT(start % type_traits[type].blck_size == 0);
|
| 20184 |
+
GGML_ASSERT(start % n_per_row == 0);
|
| 20185 |
+
|
| 20186 |
ggml_quantize_init(type); // this is noop if already initialized
|
| 20187 |
+
|
| 20188 |
+
const size_t start_row = start / n_per_row;
|
| 20189 |
+
const size_t row_size = ggml_row_size(type, n_per_row);
|
| 20190 |
+
|
| 20191 |
size_t result = 0;
|
| 20192 |
+
|
| 20193 |
switch (type) {
|
| 20194 |
+
case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20195 |
+
case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20196 |
+
case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20197 |
+
case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20198 |
+
case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20199 |
+
case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20200 |
+
case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20201 |
+
case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20202 |
+
case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20203 |
+
case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20204 |
+
case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20205 |
+
case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20206 |
+
case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20207 |
+
case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20208 |
+
case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20209 |
+
case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20210 |
+
case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20211 |
#if QK_K == 64
|
| 20212 |
+
case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
| 20213 |
+
#else
|
| 20214 |
+
case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20215 |
#endif
|
| 20216 |
case GGML_TYPE_F16:
|
| 20217 |
{
|
|
|
|
| 20228 |
default:
|
| 20229 |
assert(false);
|
| 20230 |
}
|
| 20231 |
+
|
| 20232 |
+
GGML_ASSERT(result == nrows * row_size);
|
| 20233 |
+
|
| 20234 |
return result;
|
| 20235 |
}
|
| 20236 |
|
ggml.h
CHANGED
|
@@ -2194,25 +2194,18 @@ extern "C" {
|
|
| 2194 |
GGML_API void ggml_quantize_init(enum ggml_type type);
|
| 2195 |
GGML_API void ggml_quantize_free(void);
|
| 2196 |
|
| 2197 |
-
// TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
|
| 2198 |
-
GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2199 |
-
GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2200 |
-
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2201 |
-
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2202 |
-
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2203 |
-
|
| 2204 |
-
GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2205 |
-
GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2206 |
-
GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2207 |
-
GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2208 |
-
GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
|
| 2209 |
-
|
| 2210 |
// some quantization type cannot be used without an importance matrix
|
| 2211 |
GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
|
| 2212 |
|
| 2213 |
// calls ggml_quantize_init internally (i.e. can allocate memory)
|
| 2214 |
-
GGML_API size_t ggml_quantize_chunk(
|
| 2215 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2216 |
|
| 2217 |
//
|
| 2218 |
// gguf
|
|
|
|
| 2194 |
GGML_API void ggml_quantize_init(enum ggml_type type);
|
| 2195 |
GGML_API void ggml_quantize_free(void);
|
| 2196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2197 |
// some quantization type cannot be used without an importance matrix
|
| 2198 |
GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
|
| 2199 |
|
| 2200 |
// calls ggml_quantize_init internally (i.e. can allocate memory)
|
| 2201 |
+
GGML_API size_t ggml_quantize_chunk(
|
| 2202 |
+
enum ggml_type type,
|
| 2203 |
+
const float * src,
|
| 2204 |
+
void * dst,
|
| 2205 |
+
int start,
|
| 2206 |
+
int nrows,
|
| 2207 |
+
int n_per_row,
|
| 2208 |
+
const float * imatrix);
|
| 2209 |
|
| 2210 |
//
|
| 2211 |
// gguf
|