ggerganov committed on
Commit
11a2545
·
unverified ·
1 Parent(s): 0a37735

ggml : remove old quantization functions (llama/5942)

Browse files

* ggml : remove old quantization functions

ggml-ci

* ggml : simplify ggml_quantize_chunk

ggml-ci

* ggml : restrict correctness

ggml-ci

* ggml : remove hist data from the quantization API

ggml-ci

* tests : remove hist usage in test-backend-ops

ggml-ci

* vulkan : remove hist and fix typo

Files changed (5) hide show
  1. ggml-quants.c +44 -99
  2. ggml-quants.h +23 -19
  3. ggml-vulkan.cpp +1 -39
  4. ggml.c +45 -294
  5. ggml.h +8 -15
ggml-quants.c CHANGED
@@ -1704,16 +1704,6 @@ void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) {
1704
  quantize_row_q2_K_reference(x, vy, k);
1705
  }
1706
 
1707
- size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
1708
- (void)hist; // TODO: collect histograms
1709
-
1710
- for (int j = 0; j < n; j += k) {
1711
- block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K;
1712
- quantize_row_q2_K_reference(src + j, y, k);
1713
- }
1714
- return (n/QK_K*sizeof(block_q2_K));
1715
- }
1716
-
1717
  static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
1718
  uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
1719
  float rmin, float rdelta, int nstep, bool use_mad) {
@@ -1966,8 +1956,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
1966
  }
1967
  }
1968
 
1969
- size_t quantize_q2_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
1970
- (void)hist;
1971
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
1972
  if (!quant_weights) {
1973
  quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
@@ -2186,16 +2175,6 @@ void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) {
2186
  quantize_row_q3_K_reference(x, vy, k);
2187
  }
2188
 
2189
- size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
2190
- (void)hist; // TODO: collect histograms
2191
-
2192
- for (int j = 0; j < n; j += k) {
2193
- block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K;
2194
- quantize_row_q3_K_reference(src + j, y, k);
2195
- }
2196
- return (n/QK_K*sizeof(block_q3_K));
2197
- }
2198
-
2199
  static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
2200
  #if QK_K != 256
2201
  (void)quant_weights;
@@ -2285,8 +2264,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
2285
  #endif
2286
  }
2287
 
2288
- size_t quantize_q3_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2289
- (void)hist;
2290
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2291
  if (!quant_weights) {
2292
  quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
@@ -2456,17 +2434,6 @@ void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) {
2456
  quantize_row_q4_K_reference(x, y, k);
2457
  }
2458
 
2459
- size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
2460
- assert(k % QK_K == 0);
2461
- (void)hist; // TODO: collect histograms
2462
-
2463
- for (int j = 0; j < n; j += k) {
2464
- block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K;
2465
- quantize_row_q4_K_reference(src + j, y, k);
2466
- }
2467
- return (n/QK_K*sizeof(block_q4_K));
2468
- }
2469
-
2470
  static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
2471
  #if QK_K != 256
2472
  (void)quant_weights;
@@ -2545,8 +2512,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
2545
  #endif
2546
  }
2547
 
2548
- size_t quantize_q4_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2549
- (void)hist;
2550
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2551
  if (!quant_weights) {
2552
  quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
@@ -2757,17 +2723,6 @@ void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) {
2757
  quantize_row_q5_K_reference(x, y, k);
2758
  }
2759
 
2760
- size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) {
2761
- assert(k % QK_K == 0);
2762
- (void)hist; // TODO: collect histograms
2763
-
2764
- for (int j = 0; j < n; j += k) {
2765
- block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K;
2766
- quantize_row_q5_K_reference(src + j, y, k);
2767
- }
2768
- return (n/QK_K*sizeof(block_q5_K));
2769
- }
2770
-
2771
  static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
2772
  #if QK_K != 256
2773
  (void)quant_weights;
@@ -2866,8 +2821,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
2866
  #endif
2867
  }
2868
 
2869
- size_t quantize_q5_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
2870
- (void)hist;
2871
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2872
  if (!quant_weights) {
2873
  quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
@@ -3020,17 +2974,6 @@ void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) {
3020
  quantize_row_q6_K_reference(x, y, k);
3021
  }
3022
 
3023
- size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist) {
3024
- assert(k % QK_K == 0);
3025
- (void)hist; // TODO: collect histograms
3026
-
3027
- for (int j = 0; j < n; j += k) {
3028
- block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K;
3029
- quantize_row_q6_K_reference(src + j, y, k);
3030
- }
3031
- return (n/QK_K*sizeof(block_q6_K));
3032
- }
3033
-
3034
  static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
3035
  #if QK_K != 256
3036
  (void)quant_weights;
@@ -3120,8 +3063,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
3120
  #endif
3121
  }
3122
 
3123
- size_t quantize_q6_K(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3124
- (void)hist;
3125
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3126
  if (!quant_weights) {
3127
  quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
@@ -3165,9 +3107,10 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
3165
  }
3166
  }
3167
 
3168
- size_t quantize_q4_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3169
  if (!quant_weights) {
3170
- return ggml_quantize_q4_0(src, dst, nrow*n_per_row, n_per_row, hist);
 
3171
  }
3172
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3173
  char * qrow = (char *)dst;
@@ -3209,9 +3152,10 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
3209
  }
3210
  }
3211
 
3212
- size_t quantize_q4_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3213
  if (!quant_weights) {
3214
- return ggml_quantize_q4_1(src, dst, nrow*n_per_row, n_per_row, hist);
 
3215
  }
3216
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3217
  char * qrow = (char *)dst;
@@ -3262,9 +3206,10 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
3262
  }
3263
  }
3264
 
3265
- size_t quantize_q5_0(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3266
  if (!quant_weights) {
3267
- return ggml_quantize_q5_0(src, dst, nrow*n_per_row, n_per_row, hist);
 
3268
  }
3269
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3270
  char * qrow = (char *)dst;
@@ -3314,9 +3259,10 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
3314
  }
3315
  }
3316
 
3317
- size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
3318
  if (!quant_weights) {
3319
- return ggml_quantize_q5_1(src, dst, nrow*n_per_row, n_per_row, hist);
 
3320
  }
3321
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3322
  char * qrow = (char *)dst;
@@ -3328,6 +3274,13 @@ size_t quantize_q5_1(const float * src, void * dst, int nrow, int n_per_row, int
3328
  return nrow * row_size;
3329
  }
3330
 
 
 
 
 
 
 
 
3331
  // ====================== "True" 2-bit (de)-quantization
3332
 
3333
  void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
@@ -9373,7 +9326,7 @@ void ggml_vec_dot_iq3_xxs_q8_K(int n, float * restrict s, size_t bs, const void
9373
  #endif
9374
  }
9375
 
9376
- void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9377
  assert(n % QK_K == 0);
9378
  assert(nrc == 1);
9379
  UNUSED(nrc);
@@ -9621,7 +9574,7 @@ static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
9621
  }
9622
  #endif
9623
 
9624
- void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
9625
  assert(n % QK_K == 0);
9626
  assert(nrc == 1);
9627
  UNUSED(nrc);
@@ -10221,7 +10174,7 @@ void iq2xs_init_impl(enum ggml_type type) {
10221
  int * kmap_q2xs;
10222
  uint16_t * kneighbors_q2xs;
10223
 
10224
- printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
10225
  uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
10226
  for (int k = 0; k < grid_size; ++k) {
10227
  int8_t * pos = (int8_t *)(the_grid + k);
@@ -10276,7 +10229,7 @@ void iq2xs_init_impl(enum ggml_type type) {
10276
  }
10277
  num_neighbors += n;
10278
  }
10279
- printf("%s: %d neighbours in total\n", __func__, num_neighbors);
10280
  kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
10281
  iq2_data[gindex].neighbours = kneighbors_q2xs;
10282
  int counter = 0;
@@ -10699,8 +10652,7 @@ static void quantize_row_iq2_xs_impl(const float * restrict x, void * restrict v
10699
  }
10700
  }
10701
 
10702
- size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
10703
- (void)hist;
10704
  GGML_ASSERT(n_per_row%QK_K == 0);
10705
  int nblock = n_per_row/QK_K;
10706
  char * qrow = (char *)dst;
@@ -10712,8 +10664,7 @@ size_t quantize_iq2_xxs(const float * src, void * dst, int nrow, int n_per_row,
10712
  return nrow * nblock * sizeof(block_iq2_xxs);
10713
  }
10714
 
10715
- size_t quantize_iq2_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
10716
- (void)hist;
10717
  GGML_ASSERT(n_per_row%QK_K == 0);
10718
  int nblock = n_per_row/QK_K;
10719
  char * qrow = (char *)dst;
@@ -10817,7 +10768,7 @@ void iq3xs_init_impl(int grid_size) {
10817
  int * kmap_q3xs;
10818
  uint16_t * kneighbors_q3xs;
10819
 
10820
- printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
10821
  uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
10822
  for (int k = 0; k < grid_size; ++k) {
10823
  int8_t * pos = (int8_t *)(the_grid + k);
@@ -10872,7 +10823,7 @@ void iq3xs_init_impl(int grid_size) {
10872
  }
10873
  num_neighbors += n;
10874
  }
10875
- printf("%s: %d neighbours in total\n", __func__, num_neighbors);
10876
  kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
10877
  iq3_data[gindex].neighbours = kneighbors_q3xs;
10878
  int counter = 0;
@@ -11155,8 +11106,7 @@ static void quantize_row_iq3_xxs_impl(int grid_size, const float * restrict x, v
11155
  }
11156
  }
11157
 
11158
- size_t quantize_iq3_xxs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11159
- (void)hist;
11160
  GGML_ASSERT(n_per_row%QK_K == 0);
11161
  int nblock = n_per_row/QK_K;
11162
  char * qrow = (char *)dst;
@@ -11362,8 +11312,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo
11362
  }
11363
 
11364
  #define IQ3S_BLOCK_SIZE 32
11365
- size_t quantize_iq3_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11366
- (void)hist;
11367
  GGML_ASSERT(n_per_row%QK_K == 0);
11368
  int nblock = n_per_row/QK_K;
11369
  float scales[QK_K/IQ3S_BLOCK_SIZE];
@@ -11393,7 +11342,7 @@ void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int k) {
11393
 
11394
  void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
11395
  assert(k % QK_K == 0);
11396
- quantize_iq3_s(x, y, 1, k, NULL, NULL);
11397
  }
11398
 
11399
 
@@ -11588,8 +11537,7 @@ static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy
11588
  }
11589
  }
11590
 
11591
- size_t quantize_iq1_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11592
- (void)hist;
11593
  GGML_ASSERT(n_per_row%QK_K == 0);
11594
  int nblock = n_per_row/QK_K;
11595
  char * qrow = (char *)dst;
@@ -11614,7 +11562,7 @@ static inline int best_index_int8(int n, const int8_t * val, float x) {
11614
  return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
11615
  }
11616
 
11617
- static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * GGML_RESTRICT x,
11618
  ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
11619
  float * scales, float * weight, uint8_t * L,
11620
  const int8_t * values,
@@ -11722,8 +11670,7 @@ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block
11722
  }
11723
  }
11724
 
11725
- size_t quantize_iq4_nl(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11726
- (void)hist;
11727
  GGML_ASSERT(n_per_row%QK4_NL == 0);
11728
  int nblock = n_per_row/QK4_NL;
11729
  char * qrow = (char *)dst;
@@ -11753,14 +11700,13 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int k) {
11753
 
11754
  void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
11755
  assert(k % QK4_NL == 0);
11756
- quantize_iq4_nl(x, y, 1, k, NULL, NULL);
11757
  }
11758
 
11759
- size_t quantize_iq4_xs(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11760
  #if QK_K == 64
11761
- return quantize_iq4_nl(src, dst, nrow, n_per_row, hist, quant_weights);
11762
  #else
11763
- (void)hist;
11764
  GGML_ASSERT(n_per_row%QK_K == 0);
11765
  int nblock = n_per_row/QK_K;
11766
  char * qrow = (char *)dst;
@@ -11789,7 +11735,7 @@ void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int k) {
11789
 
11790
  void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
11791
  assert(k % QK_K == 0);
11792
- quantize_iq4_xs(x, y, 1, k, NULL, NULL);
11793
  }
11794
 
11795
  // =============================== 2.5625 bpw
@@ -11962,8 +11908,7 @@ static void quantize_row_iq2_s_impl(const float * restrict x, void * restrict vy
11962
  }
11963
  }
11964
 
11965
- size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, int64_t * hist, const float * quant_weights) {
11966
- (void)hist;
11967
  GGML_ASSERT(n_per_row%QK_K == 0);
11968
  int nblock = n_per_row/QK_K;
11969
  char * qrow = (char *)dst;
@@ -11977,7 +11922,7 @@ size_t quantize_iq2_s(const float * src, void * dst, int nrow, int n_per_row, in
11977
 
11978
  void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
11979
  assert(k % QK_K == 0);
11980
- quantize_iq2_s(x, y, 1, k, NULL, NULL);
11981
  }
11982
 
11983
  void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
 
1704
  quantize_row_q2_K_reference(x, vy, k);
1705
  }
1706
 
 
 
 
 
 
 
 
 
 
 
1707
  static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
1708
  uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux,
1709
  float rmin, float rdelta, int nstep, bool use_mad) {
 
1956
  }
1957
  }
1958
 
1959
+ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
 
1960
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
1961
  if (!quant_weights) {
1962
  quantize_row_q2_K_reference(src, dst, nrow*n_per_row);
 
2175
  quantize_row_q3_K_reference(x, vy, k);
2176
  }
2177
 
 
 
 
 
 
 
 
 
 
 
2178
  static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int n_per_row, const float * restrict quant_weights) {
2179
  #if QK_K != 256
2180
  (void)quant_weights;
 
2264
  #endif
2265
  }
2266
 
2267
+ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
 
2268
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2269
  if (!quant_weights) {
2270
  quantize_row_q3_K_reference(src, dst, nrow*n_per_row);
 
2434
  quantize_row_q4_K_reference(x, y, k);
2435
  }
2436
 
 
 
 
 
 
 
 
 
 
 
 
2437
  static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int n_per_row, const float * quant_weights) {
2438
  #if QK_K != 256
2439
  (void)quant_weights;
 
2512
  #endif
2513
  }
2514
 
2515
+ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
 
2516
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2517
  if (!quant_weights) {
2518
  quantize_row_q4_K_reference(src, dst, nrow*n_per_row);
 
2723
  quantize_row_q5_K_reference(x, y, k);
2724
  }
2725
 
 
 
 
 
 
 
 
 
 
 
 
2726
  static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int n_per_row, const float * quant_weights) {
2727
  #if QK_K != 256
2728
  (void)quant_weights;
 
2821
  #endif
2822
  }
2823
 
2824
+ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
 
2825
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2826
  if (!quant_weights) {
2827
  quantize_row_q5_K_reference(src, dst, nrow*n_per_row);
 
2974
  quantize_row_q6_K_reference(x, y, k);
2975
  }
2976
 
 
 
 
 
 
 
 
 
 
 
 
2977
  static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int n_per_row, const float * quant_weights) {
2978
  #if QK_K != 256
2979
  (void)quant_weights;
 
3063
  #endif
3064
  }
3065
 
3066
+ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
 
3067
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3068
  if (!quant_weights) {
3069
  quantize_row_q6_K_reference(src, dst, nrow*n_per_row);
 
3107
  }
3108
  }
3109
 
3110
+ size_t quantize_q4_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3111
  if (!quant_weights) {
3112
+ quantize_row_q4_0_reference(src, dst, nrow*n_per_row);
3113
+ return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3114
  }
3115
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3116
  char * qrow = (char *)dst;
 
3152
  }
3153
  }
3154
 
3155
+ size_t quantize_q4_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3156
  if (!quant_weights) {
3157
+ quantize_row_q4_1_reference(src, dst, nrow*n_per_row);
3158
+ return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3159
  }
3160
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3161
  char * qrow = (char *)dst;
 
3206
  }
3207
  }
3208
 
3209
+ size_t quantize_q5_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3210
  if (!quant_weights) {
3211
+ quantize_row_q5_0_reference(src, dst, nrow*n_per_row);
3212
+ return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3213
  }
3214
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3215
  char * qrow = (char *)dst;
 
3259
  }
3260
  }
3261
 
3262
+ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3263
  if (!quant_weights) {
3264
+ quantize_row_q5_1_reference(src, dst, nrow*n_per_row);
3265
+ return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3266
  }
3267
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3268
  char * qrow = (char *)dst;
 
3274
  return nrow * row_size;
3275
  }
3276
 
3277
+ size_t quantize_q8_0(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
3278
+ (void)quant_weights; // not used
3279
+ const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
3280
+ quantize_row_q8_0_reference(src, dst, nrow*n_per_row);
3281
+ return nrow * row_size;
3282
+ }
3283
+
3284
  // ====================== "True" 2-bit (de)-quantization
3285
 
3286
  void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) {
 
9326
  #endif
9327
  }
9328
 
9329
+ void ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9330
  assert(n % QK_K == 0);
9331
  assert(nrc == 1);
9332
  UNUSED(nrc);
 
9574
  }
9575
  #endif
9576
 
9577
+ void ggml_vec_dot_iq1_s_q8_K (int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
9578
  assert(n % QK_K == 0);
9579
  assert(nrc == 1);
9580
  UNUSED(nrc);
 
10174
  int * kmap_q2xs;
10175
  uint16_t * kneighbors_q2xs;
10176
 
10177
+ //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
10178
  uint64_t * the_grid = (uint64_t *)malloc(grid_size*sizeof(uint64_t));
10179
  for (int k = 0; k < grid_size; ++k) {
10180
  int8_t * pos = (int8_t *)(the_grid + k);
 
10229
  }
10230
  num_neighbors += n;
10231
  }
10232
+ //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
10233
  kneighbors_q2xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
10234
  iq2_data[gindex].neighbours = kneighbors_q2xs;
10235
  int counter = 0;
 
10652
  }
10653
  }
10654
 
10655
+ size_t quantize_iq2_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
 
10656
  GGML_ASSERT(n_per_row%QK_K == 0);
10657
  int nblock = n_per_row/QK_K;
10658
  char * qrow = (char *)dst;
 
10664
  return nrow * nblock * sizeof(block_iq2_xxs);
10665
  }
10666
 
10667
+ size_t quantize_iq2_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
 
10668
  GGML_ASSERT(n_per_row%QK_K == 0);
10669
  int nblock = n_per_row/QK_K;
10670
  char * qrow = (char *)dst;
 
10768
  int * kmap_q3xs;
10769
  uint16_t * kneighbors_q3xs;
10770
 
10771
+ //printf("================================================================= %s(grid_size = %d)\n", __func__, grid_size);
10772
  uint32_t * the_grid = (uint32_t *)malloc(grid_size*sizeof(uint32_t));
10773
  for (int k = 0; k < grid_size; ++k) {
10774
  int8_t * pos = (int8_t *)(the_grid + k);
 
10823
  }
10824
  num_neighbors += n;
10825
  }
10826
+ //printf("%s: %d neighbours in total\n", __func__, num_neighbors);
10827
  kneighbors_q3xs = (uint16_t *)malloc((num_neighbors + num_not_in_map)*sizeof(uint16_t));
10828
  iq3_data[gindex].neighbours = kneighbors_q3xs;
10829
  int counter = 0;
 
11106
  }
11107
  }
11108
 
11109
+ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
 
11110
  GGML_ASSERT(n_per_row%QK_K == 0);
11111
  int nblock = n_per_row/QK_K;
11112
  char * qrow = (char *)dst;
 
11312
  }
11313
 
11314
  #define IQ3S_BLOCK_SIZE 32
11315
+ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
 
11316
  GGML_ASSERT(n_per_row%QK_K == 0);
11317
  int nblock = n_per_row/QK_K;
11318
  float scales[QK_K/IQ3S_BLOCK_SIZE];
 
11342
 
11343
  void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int k) {
11344
  assert(k % QK_K == 0);
11345
+ quantize_iq3_s(x, y, 1, k, NULL);
11346
  }
11347
 
11348
 
 
11537
  }
11538
  }
11539
 
11540
+ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
 
11541
  GGML_ASSERT(n_per_row%QK_K == 0);
11542
  int nblock = n_per_row/QK_K;
11543
  char * qrow = (char *)dst;
 
11562
  return x - val[mu-1] < val[mu] - x ? mu-1 : mu;
11563
  }
11564
 
11565
+ static void quantize_row_iq4_nl_impl(const int super_block_size, const int block_size, const float * restrict x,
11566
  ggml_fp16_t * dh, uint8_t * q4, uint16_t * scales_h, uint8_t * scales_l,
11567
  float * scales, float * weight, uint8_t * L,
11568
  const int8_t * values,
 
11670
  }
11671
  }
11672
 
11673
+ size_t quantize_iq4_nl(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
 
11674
  GGML_ASSERT(n_per_row%QK4_NL == 0);
11675
  int nblock = n_per_row/QK4_NL;
11676
  char * qrow = (char *)dst;
 
11700
 
11701
  void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int k) {
11702
  assert(k % QK4_NL == 0);
11703
+ quantize_iq4_nl(x, y, 1, k, NULL);
11704
  }
11705
 
11706
+ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
11707
  #if QK_K == 64
11708
+ return quantize_iq4_nl(src, dst, nrow, n_per_row, quant_weights);
11709
  #else
 
11710
  GGML_ASSERT(n_per_row%QK_K == 0);
11711
  int nblock = n_per_row/QK_K;
11712
  char * qrow = (char *)dst;
 
11735
 
11736
  void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int k) {
11737
  assert(k % QK_K == 0);
11738
+ quantize_iq4_xs(x, y, 1, k, NULL);
11739
  }
11740
 
11741
  // =============================== 2.5625 bpw
 
11908
  }
11909
  }
11910
 
11911
+ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int nrow, int n_per_row, const float * quant_weights) {
 
11912
  GGML_ASSERT(n_per_row%QK_K == 0);
11913
  int nblock = n_per_row/QK_K;
11914
  char * qrow = (char *)dst;
 
11922
 
11923
  void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int k) {
11924
  assert(k % QK_K == 0);
11925
+ quantize_iq2_s(x, y, 1, k, NULL);
11926
  }
11927
 
11928
  void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int k) {
ggml-quants.h CHANGED
@@ -261,6 +261,7 @@ void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGM
261
  void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
262
  void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
263
  void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
 
264
  void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
265
  void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
266
  void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
@@ -280,6 +281,7 @@ void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, in
280
  void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
281
  void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
282
  void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
 
283
  void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
284
  void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
285
  void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
@@ -300,6 +302,7 @@ void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRI
300
  void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
301
  void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
302
  void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 
303
  void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
304
  void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
305
  void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
@@ -321,6 +324,7 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
321
  void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
322
  void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
323
  void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
324
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
325
  void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
326
  void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
@@ -330,26 +334,26 @@ void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const
330
  void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
331
  void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
332
 
333
- //
334
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
335
- //
336
- size_t quantize_iq2_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
337
- size_t quantize_iq2_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
338
- size_t quantize_iq2_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
339
- size_t quantize_iq3_xxs(const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
340
- size_t quantize_iq1_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
341
- size_t quantize_iq4_nl (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
342
- size_t quantize_iq4_xs (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
343
- size_t quantize_iq3_s (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
344
- size_t quantize_q2_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
345
- size_t quantize_q3_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
346
- size_t quantize_q4_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
347
- size_t quantize_q5_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
348
- size_t quantize_q6_K (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
349
- size_t quantize_q4_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
350
- size_t quantize_q4_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
351
- size_t quantize_q5_0 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
352
- size_t quantize_q5_1 (const float * src, void * dst, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 
353
 
354
  void iq2xs_init_impl(enum ggml_type type);
355
  void iq2xs_free_impl(enum ggml_type type);
 
261
  void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k);
262
  void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k);
263
  void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k);
264
+
265
  void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int k);
266
  void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int k);
267
  void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int k);
 
281
  void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
282
  void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
283
  void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
284
+
285
  void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
286
  void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
287
  void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k);
 
302
  void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
303
  void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
304
  void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
305
+
306
  void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
307
  void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
308
  void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
 
324
  void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
325
  void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
326
  void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
327
+
328
  void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
329
  void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
330
  void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
 
334
  void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
335
  void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
336
 
 
337
  // Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
338
+ size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
339
+ size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
340
+ size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
341
+ size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
342
+ size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
343
+ size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
344
+ size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
345
+ size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
346
+
347
+ size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
348
+ size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
349
+ size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
350
+ size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
351
+ size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
352
+ size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
353
+ size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
354
+ size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
355
+ size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
356
+ size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int nrows, int n_per_row, const float * imatrix);
357
 
358
  void iq2xs_init_impl(enum ggml_type type);
359
  void iq2xs_free_impl(enum ggml_type type);
ggml-vulkan.cpp CHANGED
@@ -4102,45 +4102,7 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool
4102
  }
4103
 
4104
  static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
4105
- std::vector<int64_t> hist_cur(1 << 4, 0);
4106
-
4107
- switch(quant) {
4108
- case GGML_TYPE_F32:
4109
- memcpy(to, from, sizeof(float) * ne);
4110
- break;
4111
- case GGML_TYPE_Q4_0:
4112
- ggml_quantize_q4_0(from, to, ne, ne, hist_cur.data());
4113
- break;
4114
- case GGML_TYPE_Q4_1:
4115
- ggml_quantize_q4_1(from, to, ne, ne, hist_cur.data());
4116
- break;
4117
- case GGML_TYPE_Q5_0:
4118
- ggml_quantize_q5_0(from, to, ne, ne, hist_cur.data());
4119
- break;
4120
- case GGML_TYPE_Q5_1:
4121
- ggml_quantize_q5_1(from, to, ne, ne, hist_cur.data());
4122
- break;
4123
- case GGML_TYPE_Q8_0:
4124
- ggml_quantize_q8_0(from, to, ne, ne, hist_cur.data());
4125
- break;
4126
- case GGML_TYPE_Q2_K:
4127
- ggml_quantize_q2_K(from, to, ne, ne, hist_cur.data());
4128
- break;
4129
- case GGML_TYPE_Q3_K:
4130
- ggml_quantize_q3_K(from, to, ne, ne, hist_cur.data());
4131
- break;
4132
- case GGML_TYPE_Q4_K:
4133
- ggml_quantize_q4_K(from, to, ne, ne, hist_cur.data());
4134
- break;
4135
- case GGML_TYPE_Q5_K:
4136
- ggml_quantize_q5_K(from, to, ne, ne, hist_cur.data());
4137
- break;
4138
- case GGML_TYPE_Q6_K:
4139
- ggml_quantize_q6_K(from, to, ne, ne, hist_cur.data());
4140
- break;
4141
- default:
4142
- GGML_ASSERT(false);
4143
- }
4144
  }
4145
 
4146
  static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
 
4102
  }
4103
 
4104
  static void ggml_vk_quantize_data(const float * from, void * to, size_t ne, ggml_type quant) {
4105
+ ggml_quantize_chunk(quant, from, to, 0, 1, ne, nullptr);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4106
  }
4107
 
4108
  static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_type quant) {
ggml.c CHANGED
@@ -20159,133 +20159,6 @@ void ggml_quantize_free(void) {
20159
  ggml_critical_section_end();
20160
  }
20161
 
20162
- size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) {
20163
- assert(k % QK4_0 == 0);
20164
- const int nb = k / QK4_0;
20165
-
20166
- for (int b = 0; b < n; b += k) {
20167
- block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0;
20168
-
20169
- quantize_row_q4_0_reference(src + b, y, k);
20170
-
20171
- for (int i = 0; i < nb; i++) {
20172
- for (int j = 0; j < QK4_0; j += 2) {
20173
- const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
20174
- const uint8_t vi1 = y[i].qs[j/2] >> 4;
20175
-
20176
- hist[vi0]++;
20177
- hist[vi1]++;
20178
- }
20179
- }
20180
- }
20181
-
20182
- return (n/QK4_0*sizeof(block_q4_0));
20183
- }
20184
-
20185
- size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) {
20186
- assert(k % QK4_1 == 0);
20187
- const int nb = k / QK4_1;
20188
-
20189
- for (int b = 0; b < n; b += k) {
20190
- block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1;
20191
-
20192
- quantize_row_q4_1_reference(src + b, y, k);
20193
-
20194
- for (int i = 0; i < nb; i++) {
20195
- for (int j = 0; j < QK4_1; j += 2) {
20196
- const uint8_t vi0 = y[i].qs[j/2] & 0x0F;
20197
- const uint8_t vi1 = y[i].qs[j/2] >> 4;
20198
-
20199
- hist[vi0]++;
20200
- hist[vi1]++;
20201
- }
20202
- }
20203
- }
20204
-
20205
- return (n/QK4_1*sizeof(block_q4_1));
20206
- }
20207
-
20208
- size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
20209
- assert(k % QK5_0 == 0);
20210
- const int nb = k / QK5_0;
20211
-
20212
- for (int b = 0; b < n; b += k) {
20213
- block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0;
20214
-
20215
- quantize_row_q5_0_reference(src + b, y, k);
20216
-
20217
- for (int i = 0; i < nb; i++) {
20218
- uint32_t qh;
20219
- memcpy(&qh, &y[i].qh, sizeof(qh));
20220
-
20221
- for (int j = 0; j < QK5_0; j += 2) {
20222
- const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
20223
- const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
20224
-
20225
- // cast to 16 bins
20226
- const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
20227
- const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
20228
-
20229
- hist[vi0]++;
20230
- hist[vi1]++;
20231
- }
20232
- }
20233
- }
20234
-
20235
- return (n/QK5_0*sizeof(block_q5_0));
20236
- }
20237
-
20238
- size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist) {
20239
- assert(k % QK5_1 == 0);
20240
- const int nb = k / QK5_1;
20241
-
20242
- for (int b = 0; b < n; b += k) {
20243
- block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1;
20244
-
20245
- quantize_row_q5_1_reference(src + b, y, k);
20246
-
20247
- for (int i = 0; i < nb; i++) {
20248
- uint32_t qh;
20249
- memcpy(&qh, &y[i].qh, sizeof(qh));
20250
-
20251
- for (int j = 0; j < QK5_1; j += 2) {
20252
- const uint8_t vh0 = ((qh & (1u << (j/2 + 0 ))) >> (j/2 + 0 )) << 4;
20253
- const uint8_t vh1 = ((qh & (1u << (j/2 + 16))) >> (j/2 + 12));
20254
-
20255
- // cast to 16 bins
20256
- const uint8_t vi0 = ((y[i].qs[j/2] & 0x0F) | vh0) / 2;
20257
- const uint8_t vi1 = ((y[i].qs[j/2] >> 4) | vh1) / 2;
20258
-
20259
- hist[vi0]++;
20260
- hist[vi1]++;
20261
- }
20262
- }
20263
- }
20264
-
20265
- return (n/QK5_1*sizeof(block_q5_1));
20266
- }
20267
-
20268
- size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist) {
20269
- assert(k % QK8_0 == 0);
20270
- const int nb = k / QK8_0;
20271
-
20272
- for (int b = 0; b < n; b += k) {
20273
- block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0;
20274
-
20275
- quantize_row_q8_0_reference(src + b, y, k);
20276
-
20277
- for (int i = 0; i < nb; i++) {
20278
- for (int j = 0; j < QK8_0; ++j) {
20279
- const int8_t vi = y[i].qs[j];
20280
-
20281
- hist[vi/16 + 8]++;
20282
- }
20283
- }
20284
- }
20285
-
20286
- return (n/QK8_0*sizeof(block_q8_0));
20287
- }
20288
-
20289
  bool ggml_quantize_requires_imatrix(enum ggml_type type) {
20290
  return
20291
  type == GGML_TYPE_IQ2_XXS ||
@@ -20293,177 +20166,52 @@ bool ggml_quantize_requires_imatrix(enum ggml_type type) {
20293
  type == GGML_TYPE_IQ1_S;
20294
  }
20295
 
20296
- size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start,
20297
- int nrows, int n_per_row, int64_t * hist, const float * imatrix) {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20298
  ggml_quantize_init(type); // this is noop if already initialized
 
 
 
 
20299
  size_t result = 0;
20300
- int n = nrows * n_per_row;
20301
  switch (type) {
20302
- case GGML_TYPE_Q4_0:
20303
- {
20304
- GGML_ASSERT(start % QK4_0 == 0);
20305
- GGML_ASSERT(start % n_per_row == 0);
20306
- size_t start_row = start / n_per_row;
20307
- size_t row_size = ggml_row_size(type, n_per_row);
20308
- result = quantize_q4_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20309
- GGML_ASSERT(result == row_size * nrows);
20310
- } break;
20311
- case GGML_TYPE_Q4_1:
20312
- {
20313
- GGML_ASSERT(start % QK4_1 == 0);
20314
- GGML_ASSERT(start % n_per_row == 0);
20315
- size_t start_row = start / n_per_row;
20316
- size_t row_size = ggml_row_size(type, n_per_row);
20317
- result = quantize_q4_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20318
- GGML_ASSERT(result == row_size * nrows);
20319
- } break;
20320
- case GGML_TYPE_Q5_0:
20321
- {
20322
- GGML_ASSERT(start % QK5_0 == 0);
20323
- GGML_ASSERT(start % n_per_row == 0);
20324
- size_t start_row = start / n_per_row;
20325
- size_t row_size = ggml_row_size(type, n_per_row);
20326
- result = quantize_q5_0(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20327
- GGML_ASSERT(result == row_size * nrows);
20328
- } break;
20329
- case GGML_TYPE_Q5_1:
20330
- {
20331
- GGML_ASSERT(start % QK5_1 == 0);
20332
- GGML_ASSERT(start % n_per_row == 0);
20333
- size_t start_row = start / n_per_row;
20334
- size_t row_size = ggml_row_size(type, n_per_row);
20335
- result = quantize_q5_1(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20336
- GGML_ASSERT(result == row_size * nrows);
20337
- } break;
20338
- case GGML_TYPE_Q8_0:
20339
- {
20340
- GGML_ASSERT(start % QK8_0 == 0);
20341
- block_q8_0 * block = (block_q8_0*)dst + start / QK8_0;
20342
- result = ggml_quantize_q8_0(src + start, block, n, n, hist);
20343
- } break;
20344
- case GGML_TYPE_Q2_K:
20345
- {
20346
- GGML_ASSERT(start % QK_K == 0);
20347
- GGML_ASSERT(start % n_per_row == 0);
20348
- size_t start_row = start / n_per_row;
20349
- size_t row_size = ggml_row_size(type, n_per_row);
20350
- result = quantize_q2_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20351
- GGML_ASSERT(result == row_size * nrows);
20352
- } break;
20353
- case GGML_TYPE_Q3_K:
20354
- {
20355
- GGML_ASSERT(start % QK_K == 0);
20356
- GGML_ASSERT(start % n_per_row == 0);
20357
- size_t start_row = start / n_per_row;
20358
- size_t row_size = ggml_row_size(type, n_per_row);
20359
- result = quantize_q3_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20360
- GGML_ASSERT(result == row_size * nrows);
20361
- } break;
20362
- case GGML_TYPE_Q4_K:
20363
- {
20364
- GGML_ASSERT(start % QK_K == 0);
20365
- GGML_ASSERT(start % n_per_row == 0);
20366
- size_t start_row = start / n_per_row;
20367
- size_t row_size = ggml_row_size(type, n_per_row);
20368
- result = quantize_q4_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20369
- GGML_ASSERT(result == row_size * nrows);
20370
- } break;
20371
- case GGML_TYPE_Q5_K:
20372
- {
20373
- GGML_ASSERT(start % QK_K == 0);
20374
- GGML_ASSERT(start % n_per_row == 0);
20375
- size_t start_row = start / n_per_row;
20376
- size_t row_size = ggml_row_size(type, n_per_row);
20377
- result = quantize_q5_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20378
- GGML_ASSERT(result == row_size * nrows);
20379
- } break;
20380
- case GGML_TYPE_Q6_K:
20381
- {
20382
- GGML_ASSERT(start % QK_K == 0);
20383
- GGML_ASSERT(start % n_per_row == 0);
20384
- size_t start_row = start / n_per_row;
20385
- size_t row_size = ggml_row_size(type, n_per_row);
20386
- result = quantize_q6_K(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20387
- GGML_ASSERT(result == row_size * nrows);
20388
- } break;
20389
- case GGML_TYPE_IQ2_XXS:
20390
- {
20391
- GGML_ASSERT(start % QK_K == 0);
20392
- GGML_ASSERT(start % n_per_row == 0);
20393
- GGML_ASSERT(imatrix);
20394
- size_t start_row = start / n_per_row;
20395
- size_t row_size = ggml_row_size(type, n_per_row);
20396
- result = quantize_iq2_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20397
- GGML_ASSERT(result == row_size * nrows);
20398
- } break;
20399
- case GGML_TYPE_IQ2_XS:
20400
- {
20401
- GGML_ASSERT(start % QK_K == 0);
20402
- GGML_ASSERT(start % n_per_row == 0);
20403
- GGML_ASSERT(imatrix);
20404
- size_t start_row = start / n_per_row;
20405
- size_t row_size = ggml_row_size(type, n_per_row);
20406
- result = quantize_iq2_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20407
- GGML_ASSERT(result == row_size * nrows);
20408
- } break;
20409
- case GGML_TYPE_IQ3_XXS:
20410
- {
20411
- GGML_ASSERT(start % QK_K == 0);
20412
- GGML_ASSERT(start % n_per_row == 0);
20413
- size_t start_row = start / n_per_row;
20414
- size_t row_size = ggml_row_size(type, n_per_row);
20415
- result = quantize_iq3_xxs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20416
- GGML_ASSERT(result == row_size * nrows);
20417
- } break;
20418
- case GGML_TYPE_IQ3_S:
20419
- {
20420
- GGML_ASSERT(start % QK_K == 0);
20421
- GGML_ASSERT(start % n_per_row == 0);
20422
- size_t start_row = start / n_per_row;
20423
- size_t row_size = ggml_row_size(type, n_per_row);
20424
- result = quantize_iq3_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20425
- GGML_ASSERT(result == row_size * nrows);
20426
- } break;
20427
- case GGML_TYPE_IQ2_S:
20428
- {
20429
- GGML_ASSERT(start % QK_K == 0);
20430
- GGML_ASSERT(start % n_per_row == 0);
20431
- size_t start_row = start / n_per_row;
20432
- size_t row_size = ggml_row_size(type, n_per_row);
20433
- result = quantize_iq2_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20434
- GGML_ASSERT(result == row_size * nrows);
20435
- } break;
20436
- case GGML_TYPE_IQ1_S:
20437
- {
20438
- GGML_ASSERT(start % QK_K == 0);
20439
- GGML_ASSERT(start % n_per_row == 0);
20440
- size_t start_row = start / n_per_row;
20441
- size_t row_size = ggml_row_size(type, n_per_row);
20442
- result = quantize_iq1_s(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20443
- GGML_ASSERT(result == row_size * nrows);
20444
- } break;
20445
- case GGML_TYPE_IQ4_NL:
20446
  #if QK_K == 64
20447
- case GGML_TYPE_IQ4_XS:
20448
- #endif
20449
- {
20450
- GGML_ASSERT(start % QK4_NL == 0);
20451
- GGML_ASSERT(start % n_per_row == 0);
20452
- size_t start_row = start / n_per_row;
20453
- size_t row_size = ggml_row_size(type, n_per_row);
20454
- result = quantize_iq4_nl(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20455
- GGML_ASSERT(result == row_size * nrows);
20456
- } break;
20457
- #if QK_K != 64
20458
- case GGML_TYPE_IQ4_XS:
20459
- {
20460
- GGML_ASSERT(start % QK_K == 0);
20461
- GGML_ASSERT(start % n_per_row == 0);
20462
- size_t start_row = start / n_per_row;
20463
- size_t row_size = ggml_row_size(type, n_per_row);
20464
- result = quantize_iq4_xs(src + start, (char *)dst + start_row * row_size, nrows, n_per_row, hist, imatrix);
20465
- GGML_ASSERT(result == row_size * nrows);
20466
- } break;
20467
  #endif
20468
  case GGML_TYPE_F16:
20469
  {
@@ -20480,6 +20228,9 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
20480
  default:
20481
  assert(false);
20482
  }
 
 
 
20483
  return result;
20484
  }
20485
 
 
20159
  ggml_critical_section_end();
20160
  }
20161
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20162
  bool ggml_quantize_requires_imatrix(enum ggml_type type) {
20163
  return
20164
  type == GGML_TYPE_IQ2_XXS ||
 
20166
  type == GGML_TYPE_IQ1_S;
20167
  }
20168
 
20169
+ size_t ggml_quantize_chunk(
20170
+ enum ggml_type type,
20171
+ const float * src,
20172
+ void * dst,
20173
+ int start,
20174
+ int nrows,
20175
+ int n_per_row,
20176
+ const float * imatrix) {
20177
+ const int n = nrows * n_per_row;
20178
+
20179
+ if (ggml_quantize_requires_imatrix(type)) {
20180
+ GGML_ASSERT(imatrix != NULL);
20181
+ }
20182
+
20183
+ GGML_ASSERT(start % type_traits[type].blck_size == 0);
20184
+ GGML_ASSERT(start % n_per_row == 0);
20185
+
20186
  ggml_quantize_init(type); // this is noop if already initialized
20187
+
20188
+ const size_t start_row = start / n_per_row;
20189
+ const size_t row_size = ggml_row_size(type, n_per_row);
20190
+
20191
  size_t result = 0;
20192
+
20193
  switch (type) {
20194
+ case GGML_TYPE_Q4_0: result = quantize_q4_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20195
+ case GGML_TYPE_Q4_1: result = quantize_q4_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20196
+ case GGML_TYPE_Q5_0: result = quantize_q5_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20197
+ case GGML_TYPE_Q5_1: result = quantize_q5_1(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20198
+ case GGML_TYPE_Q8_0: result = quantize_q8_0(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20199
+ case GGML_TYPE_Q2_K: result = quantize_q2_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20200
+ case GGML_TYPE_Q3_K: result = quantize_q3_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20201
+ case GGML_TYPE_Q4_K: result = quantize_q4_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20202
+ case GGML_TYPE_Q5_K: result = quantize_q5_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20203
+ case GGML_TYPE_Q6_K: result = quantize_q6_K(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20204
+ case GGML_TYPE_IQ2_XXS: result = quantize_iq2_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20205
+ case GGML_TYPE_IQ2_XS: result = quantize_iq2_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20206
+ case GGML_TYPE_IQ3_XXS: result = quantize_iq3_xxs(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20207
+ case GGML_TYPE_IQ3_S: result = quantize_iq3_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20208
+ case GGML_TYPE_IQ2_S: result = quantize_iq2_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20209
+ case GGML_TYPE_IQ1_S: result = quantize_iq1_s (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20210
+ case GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20211
  #if QK_K == 64
20212
+ case GGML_TYPE_IQ4_XS: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
20213
+ #else
20214
+ case GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20215
  #endif
20216
  case GGML_TYPE_F16:
20217
  {
 
20228
  default:
20229
  assert(false);
20230
  }
20231
+
20232
+ GGML_ASSERT(result == nrows * row_size);
20233
+
20234
  return result;
20235
  }
20236
 
ggml.h CHANGED
@@ -2194,25 +2194,18 @@ extern "C" {
2194
  GGML_API void ggml_quantize_init(enum ggml_type type);
2195
  GGML_API void ggml_quantize_free(void);
2196
 
2197
- // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
2198
- GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
2199
- GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
2200
- GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
2201
- GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
2202
- GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
2203
-
2204
- GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
2205
- GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
2206
- GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
2207
- GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
2208
- GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
2209
-
2210
  // some quantization type cannot be used without an importance matrix
2211
  GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
2212
 
2213
  // calls ggml_quantize_init internally (i.e. can allocate memory)
2214
- GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst,
2215
- int start, int nrows, int n_per_row, int64_t * hist, const float * imatrix);
 
 
 
 
 
 
2216
 
2217
  //
2218
  // gguf
 
2194
  GGML_API void ggml_quantize_init(enum ggml_type type);
2195
  GGML_API void ggml_quantize_free(void);
2196
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2197
  // some quantization type cannot be used without an importance matrix
2198
  GGML_API bool ggml_quantize_requires_imatrix(enum ggml_type type);
2199
 
2200
  // calls ggml_quantize_init internally (i.e. can allocate memory)
2201
+ GGML_API size_t ggml_quantize_chunk(
2202
+ enum ggml_type type,
2203
+ const float * src,
2204
+ void * dst,
2205
+ int start,
2206
+ int nrows,
2207
+ int n_per_row,
2208
+ const float * imatrix);
2209
 
2210
  //
2211
  // gguf