Spaces:
Running
Running
Slight quantization improvement for Q4_K and Q5_K (llama/5361)
Browse files
* Q4_K: slightly better quantization
* Q5_K: slightly better quantization
---------
Co-authored-by: Iwan Kawrakow <[email protected]>
- ggml-quants.c +33 -42
ggml-quants.c
CHANGED
|
@@ -2381,19 +2381,20 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
| 2381 |
|
| 2382 |
uint8_t L[QK_K];
|
| 2383 |
uint8_t Laux[32];
|
|
|
|
|
|
|
| 2384 |
float weights[32];
|
| 2385 |
-
float
|
| 2386 |
-
float
|
|
|
|
| 2387 |
|
| 2388 |
for (int i = 0; i < nb; i++) {
|
| 2389 |
|
| 2390 |
float sum_x2 = 0;
|
| 2391 |
for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
|
| 2392 |
-
float sigma2 = sum_x2/QK_K;
|
| 2393 |
float av_x = sqrtf(sigma2);
|
| 2394 |
|
| 2395 |
-
float max_scale = 0; // as we are deducting the min, scales are always positive
|
| 2396 |
-
float max_min = 0;
|
| 2397 |
for (int j = 0; j < QK_K/32; ++j) {
|
| 2398 |
if (quant_weights) {
|
| 2399 |
const float * qw = quant_weights + QK_K*i + 32*j;
|
|
@@ -2401,25 +2402,17 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
| 2401 |
} else {
|
| 2402 |
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
| 2403 |
}
|
|
|
|
|
|
|
|
|
|
| 2404 |
scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
| 2405 |
-
//scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
|
| 2406 |
-
float scale = scales[j];
|
| 2407 |
-
if (scale > max_scale) {
|
| 2408 |
-
max_scale = scale;
|
| 2409 |
-
}
|
| 2410 |
-
float min = mins[j];
|
| 2411 |
-
if (min > max_min) {
|
| 2412 |
-
max_min = min;
|
| 2413 |
-
}
|
| 2414 |
}
|
| 2415 |
|
| 2416 |
-
float
|
| 2417 |
-
float
|
| 2418 |
for (int j = 0; j < QK_K/32; ++j) {
|
| 2419 |
-
uint8_t ls =
|
| 2420 |
-
uint8_t lm =
|
| 2421 |
-
ls = MIN(63, ls);
|
| 2422 |
-
lm = MIN(63, lm);
|
| 2423 |
if (j < 4) {
|
| 2424 |
y[i].scales[j] = ls;
|
| 2425 |
y[i].scales[j+4] = lm;
|
|
@@ -2429,8 +2422,8 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
| 2429 |
y[i].scales[j-0] |= ((lm >> 4) << 6);
|
| 2430 |
}
|
| 2431 |
}
|
| 2432 |
-
y[i].d = GGML_FP32_TO_FP16(
|
| 2433 |
-
y[i].dmin = GGML_FP32_TO_FP16(
|
| 2434 |
|
| 2435 |
uint8_t sc, m;
|
| 2436 |
for (int j = 0; j < QK_K/32; ++j) {
|
|
@@ -2688,20 +2681,21 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
| 2688 |
const int nb = n_per_row / QK_K;
|
| 2689 |
|
| 2690 |
uint8_t L[QK_K];
|
| 2691 |
-
float mins[QK_K/32];
|
| 2692 |
-
float scales[QK_K/32];
|
| 2693 |
-
float weights[32];
|
| 2694 |
uint8_t Laux[32];
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2695 |
|
| 2696 |
for (int i = 0; i < nb; i++) {
|
| 2697 |
|
| 2698 |
float sum_x2 = 0;
|
| 2699 |
for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
|
| 2700 |
-
float sigma2 = sum_x2/QK_K;
|
| 2701 |
float av_x = sqrtf(sigma2);
|
| 2702 |
|
| 2703 |
-
float max_scale = 0; // as we are deducting the min, scales are always positive
|
| 2704 |
-
float max_min = 0;
|
| 2705 |
for (int j = 0; j < QK_K/32; ++j) {
|
| 2706 |
if (quant_weights) {
|
| 2707 |
const float * qw = quant_weights + QK_K*i + 32*j;
|
|
@@ -2709,22 +2703,19 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
| 2709 |
} else {
|
| 2710 |
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
| 2711 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2712 |
scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
| 2713 |
-
float scale = scales[j];
|
| 2714 |
-
if (scale > max_scale) {
|
| 2715 |
-
max_scale = scale;
|
| 2716 |
-
}
|
| 2717 |
-
float min = mins[j];
|
| 2718 |
-
if (min > max_min) {
|
| 2719 |
-
max_min = min;
|
| 2720 |
-
}
|
| 2721 |
}
|
| 2722 |
|
| 2723 |
-
float
|
| 2724 |
-
float
|
|
|
|
| 2725 |
for (int j = 0; j < QK_K/32; ++j) {
|
| 2726 |
-
uint8_t ls =
|
| 2727 |
-
uint8_t lm =
|
| 2728 |
ls = MIN(63, ls);
|
| 2729 |
lm = MIN(63, lm);
|
| 2730 |
if (j < 4) {
|
|
@@ -2736,8 +2727,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
| 2736 |
y[i].scales[j-0] |= ((lm >> 4) << 6);
|
| 2737 |
}
|
| 2738 |
}
|
| 2739 |
-
y[i].d = GGML_FP32_TO_FP16(
|
| 2740 |
-
y[i].dmin = GGML_FP32_TO_FP16(
|
| 2741 |
|
| 2742 |
uint8_t sc, m;
|
| 2743 |
for (int j = 0; j < QK_K/32; ++j) {
|
|
|
|
| 2381 |
|
| 2382 |
uint8_t L[QK_K];
|
| 2383 |
uint8_t Laux[32];
|
| 2384 |
+
uint8_t Ls[QK_K/32];
|
| 2385 |
+
uint8_t Lm[QK_K/32];
|
| 2386 |
float weights[32];
|
| 2387 |
+
float sw[QK_K/32];
|
| 2388 |
+
float mins[QK_K/32];
|
| 2389 |
+
float scales[QK_K/32];
|
| 2390 |
|
| 2391 |
for (int i = 0; i < nb; i++) {
|
| 2392 |
|
| 2393 |
float sum_x2 = 0;
|
| 2394 |
for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
|
| 2395 |
+
float sigma2 = 2*sum_x2/QK_K;
|
| 2396 |
float av_x = sqrtf(sigma2);
|
| 2397 |
|
|
|
|
|
|
|
| 2398 |
for (int j = 0; j < QK_K/32; ++j) {
|
| 2399 |
if (quant_weights) {
|
| 2400 |
const float * qw = quant_weights + QK_K*i + 32*j;
|
|
|
|
| 2402 |
} else {
|
| 2403 |
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
| 2404 |
}
|
| 2405 |
+
float sumw = 0;
|
| 2406 |
+
for (int l = 0; l < 32; ++l) sumw += weights[l];
|
| 2407 |
+
sw[j] = sumw;
|
| 2408 |
scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2409 |
}
|
| 2410 |
|
| 2411 |
+
float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
|
| 2412 |
+
float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
|
| 2413 |
for (int j = 0; j < QK_K/32; ++j) {
|
| 2414 |
+
uint8_t ls = Ls[j];
|
| 2415 |
+
uint8_t lm = Lm[j];
|
|
|
|
|
|
|
| 2416 |
if (j < 4) {
|
| 2417 |
y[i].scales[j] = ls;
|
| 2418 |
y[i].scales[j+4] = lm;
|
|
|
|
| 2422 |
y[i].scales[j-0] |= ((lm >> 4) << 6);
|
| 2423 |
}
|
| 2424 |
}
|
| 2425 |
+
y[i].d = GGML_FP32_TO_FP16(d_block);
|
| 2426 |
+
y[i].dmin = GGML_FP32_TO_FP16(m_block);
|
| 2427 |
|
| 2428 |
uint8_t sc, m;
|
| 2429 |
for (int j = 0; j < QK_K/32; ++j) {
|
|
|
|
| 2681 |
const int nb = n_per_row / QK_K;
|
| 2682 |
|
| 2683 |
uint8_t L[QK_K];
|
|
|
|
|
|
|
|
|
|
| 2684 |
uint8_t Laux[32];
|
| 2685 |
+
uint8_t Ls[QK_K/32];
|
| 2686 |
+
uint8_t Lm[QK_K/32];
|
| 2687 |
+
float mins[QK_K/32];
|
| 2688 |
+
float scales[QK_K/32];
|
| 2689 |
+
float sw[QK_K/32];
|
| 2690 |
+
float weights[32];
|
| 2691 |
|
| 2692 |
for (int i = 0; i < nb; i++) {
|
| 2693 |
|
| 2694 |
float sum_x2 = 0;
|
| 2695 |
for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
|
| 2696 |
+
float sigma2 = 2*sum_x2/QK_K;
|
| 2697 |
float av_x = sqrtf(sigma2);
|
| 2698 |
|
|
|
|
|
|
|
| 2699 |
for (int j = 0; j < QK_K/32; ++j) {
|
| 2700 |
if (quant_weights) {
|
| 2701 |
const float * qw = quant_weights + QK_K*i + 32*j;
|
|
|
|
| 2703 |
} else {
|
| 2704 |
for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
|
| 2705 |
}
|
| 2706 |
+
float sumw = 0;
|
| 2707 |
+
for (int l = 0; l < 32; ++l) sumw += weights[l];
|
| 2708 |
+
sw[j] = sumw;
|
| 2709 |
+
|
| 2710 |
scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2711 |
}
|
| 2712 |
|
| 2713 |
+
float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
|
| 2714 |
+
float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
|
| 2715 |
+
|
| 2716 |
for (int j = 0; j < QK_K/32; ++j) {
|
| 2717 |
+
uint8_t ls = Ls[j];
|
| 2718 |
+
uint8_t lm = Lm[j];
|
| 2719 |
ls = MIN(63, ls);
|
| 2720 |
lm = MIN(63, lm);
|
| 2721 |
if (j < 4) {
|
|
|
|
| 2727 |
y[i].scales[j-0] |= ((lm >> 4) << 6);
|
| 2728 |
}
|
| 2729 |
}
|
| 2730 |
+
y[i].d = GGML_FP32_TO_FP16(d_block);
|
| 2731 |
+
y[i].dmin = GGML_FP32_TO_FP16(m_block);
|
| 2732 |
|
| 2733 |
uint8_t sc, m;
|
| 2734 |
for (int j = 0; j < QK_K/32; ++j) {
|