Kawrakow ikawrakow committed
Commit e3cd020 · unverified · 1 parent: ae45b38

Slight quantization improvement for Q4_K and Q5_K (llama/5361)


* Q4_K: slightly better quantization

* Q5_K: slightly better quantization

---------

Co-authored-by: Iwan Kawrakow <[email protected]>

Files changed (1)
  1. ggml-quants.c +33 -42
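
In both quantize_row_q4_K_impl and quantize_row_q5_K_impl, the 6-bit super-block scale and min codes were previously obtained by dividing every sub-block scale/min by the block maximum and rounding each one independently. The diff below instead records an importance weight per sub-block (sw[j], the sum of that sub-block's per-value weights) and lets make_qp_quants choose the 6-bit codes that minimize a weighted quantization error; the variance estimate sigma2 is also doubled. The sketch below only illustrates the underlying idea of weighted rounding of non-negative values to a shared scale; it is not the actual make_qp_quants implementation, and the function name quantize_positive_weighted is invented for illustration.

#include <math.h>
#include <stdint.h>

// Illustration only: quantize n non-negative values to integers in [0, nmax]
// with one shared scale d, picking d so that the weighted squared error
//     sum_j w[j] * (x[j] - d*L[j])^2
// is small. make_qp_quants in ggml-quants.c follows the same idea but differs
// in the candidate search and in how it refines the final scale.
static float quantize_positive_weighted(int n, int nmax, const float * x,
                                        uint8_t * L, const float * w) {
    float max = 0;
    for (int j = 0; j < n; ++j) if (x[j] > max) max = x[j];
    if (max == 0) {                      // all zero: nothing to quantize
        for (int j = 0; j < n; ++j) L[j] = 0;
        return 0.f;
    }
    float best_d = max/nmax, best_err = INFINITY;
    // scan a few candidate scales around max/nmax and keep the best one
    for (int is = -4; is <= 4; ++is) {
        float d = max/(nmax + 0.1f*is);
        float err = 0;
        for (int j = 0; j < n; ++j) {
            int l = (int)roundf(x[j]/d);
            if (l < 0) l = 0; else if (l > nmax) l = nmax;
            float diff = x[j] - d*l;
            err += w[j]*diff*diff;       // error weighted by importance w[j]
        }
        if (err < best_err) { best_err = err; best_d = d; }
    }
    for (int j = 0; j < n; ++j) {
        int l = (int)roundf(x[j]/best_d);
        L[j] = (uint8_t)(l < 0 ? 0 : l > nmax ? nmax : l);
    }
    return best_d;
}

In the diff this role is played by make_qp_quants(QK_K/32, 63, scales, Ls, sw) for the sub-block scales and by the analogous call on mins for the sub-block minima.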
ggml-quants.c CHANGED

@@ -2381,19 +2381,20 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
 
     uint8_t L[QK_K];
     uint8_t Laux[32];
+    uint8_t Ls[QK_K/32];
+    uint8_t Lm[QK_K/32];
     float weights[32];
-    float mins[QK_K/32];
-    float scales[QK_K/32];
+    float sw[QK_K/32];
+    float mins[QK_K/32];
+    float scales[QK_K/32];
 
     for (int i = 0; i < nb; i++) {
 
         float sum_x2 = 0;
         for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
-        float sigma2 = sum_x2/QK_K;
+        float sigma2 = 2*sum_x2/QK_K;
         float av_x = sqrtf(sigma2);
 
-        float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
         for (int j = 0; j < QK_K/32; ++j) {
             if (quant_weights) {
                 const float * qw = quant_weights + QK_K*i + 32*j;
@@ -2401,25 +2402,17 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
             } else {
                 for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
             }
+            float sumw = 0;
+            for (int l = 0; l < 32; ++l) sumw += weights[l];
+            sw[j] = sumw;
             scales[j] = make_qkx3_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
-            //scales[j] = make_qkx2_quants(32, 15, x + 32*j, weights, L + 32*j, &mins[j], Laux, -1.f, 0.1f, 20, false);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
-            }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
-            }
         }
 
-        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
-        float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
+        float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
+        float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
         for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = nearest_int(inv_scale*scales[j]);
-            uint8_t lm = nearest_int(inv_min*mins[j]);
-            ls = MIN(63, ls);
-            lm = MIN(63, lm);
+            uint8_t ls = Ls[j];
+            uint8_t lm = Lm[j];
             if (j < 4) {
                 y[i].scales[j] = ls;
                 y[i].scales[j+4] = lm;
@@ -2429,8 +2422,8 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
                 y[i].scales[j-0] |= ((lm >> 4) << 6);
             }
         }
-        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
-        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
+        y[i].d = GGML_FP32_TO_FP16(d_block);
+        y[i].dmin = GGML_FP32_TO_FP16(m_block);
 
         uint8_t sc, m;
         for (int j = 0; j < QK_K/32; ++j) {
@@ -2688,20 +2681,21 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
     const int nb = n_per_row / QK_K;
 
     uint8_t L[QK_K];
-    float mins[QK_K/32];
-    float scales[QK_K/32];
-    float weights[32];
     uint8_t Laux[32];
+    uint8_t Ls[QK_K/32];
+    uint8_t Lm[QK_K/32];
+    float mins[QK_K/32];
+    float scales[QK_K/32];
+    float sw[QK_K/32];
+    float weights[32];
 
     for (int i = 0; i < nb; i++) {
 
         float sum_x2 = 0;
         for (int l = 0; l < QK_K; ++l) sum_x2 += x[l] * x[l];
-        float sigma2 = sum_x2/QK_K;
+        float sigma2 = 2*sum_x2/QK_K;
        float av_x = sqrtf(sigma2);
 
-        float max_scale = 0; // as we are deducting the min, scales are always positive
-        float max_min = 0;
         for (int j = 0; j < QK_K/32; ++j) {
             if (quant_weights) {
                 const float * qw = quant_weights + QK_K*i + 32*j;
@@ -2709,22 +2703,19 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
             } else {
                 for (int l = 0; l < 32; ++l) weights[l] = av_x + fabsf(x[32*j + l]);
             }
+            float sumw = 0;
+            for (int l = 0; l < 32; ++l) sumw += weights[l];
+            sw[j] = sumw;
+
             scales[j] = make_qkx3_quants(32, 31, x + 32*j, weights, L + 32*j, &mins[j], Laux, -0.9f, 0.05f, 36, false);
-            float scale = scales[j];
-            if (scale > max_scale) {
-                max_scale = scale;
-            }
-            float min = mins[j];
-            if (min > max_min) {
-                max_min = min;
-            }
         }
 
-        float inv_scale = max_scale > 0 ? 63.f/max_scale : 0.f;
-        float inv_min = max_min > 0 ? 63.f/max_min : 0.f;
+        float d_block = make_qp_quants(QK_K/32, 63, scales, Ls, sw);
+        float m_block = make_qp_quants(QK_K/32, 63, mins, Lm, sw);
+
        for (int j = 0; j < QK_K/32; ++j) {
-            uint8_t ls = nearest_int(inv_scale*scales[j]);
-            uint8_t lm = nearest_int(inv_min*mins[j]);
+            uint8_t ls = Ls[j];
+            uint8_t lm = Lm[j];
             ls = MIN(63, ls);
             lm = MIN(63, lm);
             if (j < 4) {
@@ -2736,8 +2727,8 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
                 y[i].scales[j-0] |= ((lm >> 4) << 6);
             }
         }
-        y[i].d = GGML_FP32_TO_FP16(max_scale/63.f);
-        y[i].dmin = GGML_FP32_TO_FP16(max_min/63.f);
+        y[i].d = GGML_FP32_TO_FP16(d_block);
+        y[i].dmin = GGML_FP32_TO_FP16(m_block);
 
         uint8_t sc, m;
         for (int j = 0; j < QK_K/32; ++j) {
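
For context on why the rounding of the 6-bit codes matters: at dequantization every value of a sub-block is reconstructed from the fp16 super-block factors d/dmin multiplied by that sub-block's 6-bit scale/min, so any rounding error in ls/lm is amplified across all 32 values of the sub-block. The sketch below is a simplified reconstruction of one Q4_K sub-block, assuming the packed 6-bit scales and 4-bit nibbles have already been unpacked; the function name dequant_subblock_q4_K is made up here, and the real code is dequantize_row_q4_K, which works directly on the packed layout.

#include <stdint.h>

// Simplified reconstruction of one 32-value Q4_K sub-block (sketch only).
// d and dmin are the fp16 super-block factors converted to float; sc and m
// are the 6-bit per-sub-block integers that this commit now chooses with
// make_qp_quants instead of max-based rounding; q holds the 4-bit quants 0..15.
static void dequant_subblock_q4_K(float d, float dmin, uint8_t sc, uint8_t m,
                                  const uint8_t * q, float * x) {
    const float dj = d    * sc;   // effective sub-block scale
    const float mj = dmin * m;    // effective sub-block offset
    for (int l = 0; l < 32; ++l) {
        x[l] = dj * q[l] - mj;    // y = d*sc*q - dmin*m
    }
}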