ggerganov committed on
Commit
e0c6dff
·
1 Parent(s): 374488a

ggml : minor naming changes (llama/8433)

Browse files

* ggml : minor naming changes

ggml-ci

* ggml : use PRId64 [no ci]

* ggml : revert FA K/Q names

ggml/include/ggml.h CHANGED
@@ -714,9 +714,9 @@ extern "C" {
714
  GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
715
  GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
716
 
717
- GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
718
- GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
719
- GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
720
 
721
  GGML_DEPRECATED(
722
  GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
@@ -2410,31 +2410,31 @@ extern "C" {
2410
  #endif
2411
  typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
2412
  typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
2413
- typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
2414
- const void * GGML_RESTRICT y, size_t by, int nrc);
2415
- typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr,
2416
- int64_t k, int64_t bx);
2417
- typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
2418
- const void * GGML_RESTRICT y, int nr, int nc);
2419
- typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
2420
- const void * GGML_RESTRICT y, int nr, int nc);
2421
 
2422
  typedef struct {
2423
- const char * type_name;
2424
- int blck_size;
2425
- size_t type_size;
2426
- bool is_quantized;
2427
- ggml_to_float_t to_float;
2428
- ggml_from_float_t from_float;
2429
- ggml_from_float_t from_float_reference;
2430
- ggml_vec_dot_t vec_dot;
2431
- enum ggml_type vec_dot_type;
2432
- int64_t nrows; // number of rows to process simultaneously;
2433
- int64_t ncols; // number of columns to process simultaneously;
2434
- int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
2435
  ggml_from_float_to_mat_t from_float_to_mat;
2436
- ggml_gemv_t gemv;
2437
- ggml_gemm_t gemm;
 
 
 
 
2438
  } ggml_type_traits_t;
2439
 
2440
  GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
 
714
  GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
715
  GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
716
 
717
+ GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type);
718
+ GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
719
+ GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
720
 
721
  GGML_DEPRECATED(
722
  GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
 
2410
  #endif
2411
  typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
2412
  typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
2413
+ typedef void (*ggml_from_float_to_mat_t)
2414
+ (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
2415
+ typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
2416
+ const void * GGML_RESTRICT y, size_t by, int nrc);
2417
+ typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
2418
+ const void * GGML_RESTRICT y, int nr, int nc);
2419
+ typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
2420
+ const void * GGML_RESTRICT y, int nr, int nc);
2421
 
2422
  typedef struct {
2423
+ const char * type_name;
2424
+ int64_t blck_size;
2425
+ int64_t blck_size_interleave; // interleave elements in blocks
2426
+ size_t type_size;
2427
+ bool is_quantized;
2428
+ ggml_to_float_t to_float;
2429
+ ggml_from_float_t from_float;
2430
+ ggml_from_float_t from_float_ref;
 
 
 
 
2431
  ggml_from_float_to_mat_t from_float_to_mat;
2432
+ ggml_vec_dot_t vec_dot;
2433
+ enum ggml_type vec_dot_type;
2434
+ int64_t nrows; // number of rows to process simultaneously
2435
+ int64_t ncols; // number of columns to process simultaneously
2436
+ ggml_gemv_t gemv;
2437
+ ggml_gemm_t gemm;
2438
  } ggml_type_traits_t;
2439
 
2440
  GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
ggml/src/ggml-quants.c CHANGED
@@ -658,7 +658,7 @@ static inline __m128i packNibbles( __m256i bytes ) {
658
  #endif //__loongarch_asx
659
 
660
  // reference implementation for deterministic creation of model files
661
- void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
662
  static const int qk = QK4_0;
663
 
664
  assert(k % qk == 0);
@@ -696,11 +696,11 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
696
  }
697
 
698
  void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
699
- quantize_row_q4_0_reference(x, y, k);
700
  }
701
 
702
 
703
- void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
704
  const int qk = QK4_1;
705
 
706
  assert(k % qk == 0);
@@ -738,10 +738,10 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
738
  }
739
 
740
  void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
741
- quantize_row_q4_1_reference(x, y, k);
742
  }
743
 
744
- void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
745
  static const int qk = QK5_0;
746
 
747
  assert(k % qk == 0);
@@ -786,10 +786,10 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict
786
  }
787
 
788
  void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
789
- quantize_row_q5_0_reference(x, y, k);
790
  }
791
 
792
- void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
793
  const int qk = QK5_1;
794
 
795
  assert(k % qk == 0);
@@ -834,11 +834,11 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
834
  }
835
 
836
  void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
837
- quantize_row_q5_1_reference(x, y, k);
838
  }
839
 
840
  // reference implementation for deterministic creation of model files
841
- void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
842
  assert(k % QK8_0 == 0);
843
  const int nb = k / QK8_0;
844
 
@@ -1144,12 +1144,12 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
1144
  #else
1145
  GGML_UNUSED(nb);
1146
  // scalar
1147
- quantize_row_q8_0_reference(x, y, k);
1148
  #endif
1149
  }
1150
 
1151
  // reference implementation for deterministic creation of model files
1152
- void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
1153
  assert(QK8_1 == 32);
1154
  assert(k % QK8_1 == 0);
1155
  const int nb = k / QK8_1;
@@ -1508,7 +1508,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
1508
  #else
1509
  GGML_UNUSED(nb);
1510
  // scalar
1511
- quantize_row_q8_1_reference(x, y, k);
1512
  #endif
1513
  }
1514
 
@@ -1899,7 +1899,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
1899
 
1900
  //========================- 2-bit (de)-quantization
1901
 
1902
- void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int64_t k) {
1903
  assert(k % QK_K == 0);
1904
  const int nb = k / QK_K;
1905
 
@@ -2002,7 +2002,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6
2002
  }
2003
 
2004
  void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
2005
- quantize_row_q2_K_reference(x, vy, k);
2006
  }
2007
 
2008
  static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
@@ -2226,7 +2226,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
2226
  size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2227
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
2228
  if (!quant_weights) {
2229
- quantize_row_q2_K_reference(src, dst, (int64_t)nrow*n_per_row);
2230
  }
2231
  else {
2232
  char * qrow = (char *)dst;
@@ -2241,7 +2241,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nr
2241
 
2242
  //========================= 3-bit (de)-quantization
2243
 
2244
- void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int64_t k) {
2245
  assert(k % QK_K == 0);
2246
  const int nb = k / QK_K;
2247
 
@@ -2368,7 +2368,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
2368
  }
2369
 
2370
  void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
2371
- quantize_row_q3_K_reference(x, vy, k);
2372
  }
2373
 
2374
  static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
@@ -2458,7 +2458,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
2458
  size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2459
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2460
  if (!quant_weights) {
2461
- quantize_row_q3_K_reference(src, dst, (int64_t)nrow*n_per_row);
2462
  }
2463
  else {
2464
  char * qrow = (char *)dst;
@@ -2473,7 +2473,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nr
2473
 
2474
  // ====================== 4-bit (de)-quantization
2475
 
2476
- void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int64_t k) {
2477
  assert(k % QK_K == 0);
2478
  const int nb = k / QK_K;
2479
 
@@ -2572,7 +2572,7 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6
2572
  void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
2573
  assert(k % QK_K == 0);
2574
  block_q4_K * restrict y = vy;
2575
- quantize_row_q4_K_reference(x, y, k);
2576
  }
2577
 
2578
  static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
@@ -2651,7 +2651,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
2651
  size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2652
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2653
  if (!quant_weights) {
2654
- quantize_row_q4_K_reference(src, dst, (int64_t)nrow*n_per_row);
2655
  }
2656
  else {
2657
  char * qrow = (char *)dst;
@@ -2666,7 +2666,7 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nr
2666
 
2667
  // ====================== 5-bit (de)-quantization
2668
 
2669
- void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int64_t k) {
2670
  assert(k % QK_K == 0);
2671
  const int64_t nb = k / QK_K;
2672
 
@@ -2783,7 +2783,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6
2783
  void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
2784
  assert(k % QK_K == 0);
2785
  block_q5_K * restrict y = vy;
2786
- quantize_row_q5_K_reference(x, y, k);
2787
  }
2788
 
2789
  static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
@@ -2882,7 +2882,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
2882
  size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2883
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2884
  if (!quant_weights) {
2885
- quantize_row_q5_K_reference(src, dst, (int64_t)nrow*n_per_row);
2886
  }
2887
  else {
2888
  char * qrow = (char *)dst;
@@ -2897,7 +2897,7 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nr
2897
 
2898
  // ====================== 6-bit (de)-quantization
2899
 
2900
- void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int64_t k) {
2901
  assert(k % QK_K == 0);
2902
  const int64_t nb = k / QK_K;
2903
 
@@ -3001,7 +3001,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6
3001
  void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
3002
  assert(k % QK_K == 0);
3003
  block_q6_K * restrict y = vy;
3004
- quantize_row_q6_K_reference(x, y, k);
3005
  }
3006
 
3007
  static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
@@ -3091,7 +3091,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
3091
  size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3092
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3093
  if (!quant_weights) {
3094
- quantize_row_q6_K_reference(src, dst, (int64_t)nrow*n_per_row);
3095
  }
3096
  else {
3097
  char * qrow = (char *)dst;
@@ -3108,7 +3108,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
3108
  static_assert(QK4_0 == 32, "QK4_0 must be 32");
3109
 
3110
  if (!quant_weights) {
3111
- quantize_row_q4_0_reference(x, y, n_per_row);
3112
  return;
3113
  }
3114
 
@@ -3134,7 +3134,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
3134
 
3135
  size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3136
  if (!quant_weights) {
3137
- quantize_row_q4_0_reference(src, dst, (int64_t)nrow*n_per_row);
3138
  return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3139
  }
3140
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
@@ -3151,7 +3151,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
3151
  static_assert(QK4_1 == 32, "QK4_1 must be 32");
3152
 
3153
  if (!quant_weights) {
3154
- quantize_row_q4_1_reference(x, y, n_per_row);
3155
  return;
3156
  }
3157
 
@@ -3179,7 +3179,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
3179
 
3180
  size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3181
  if (!quant_weights) {
3182
- quantize_row_q4_1_reference(src, dst, (int64_t)nrow*n_per_row);
3183
  return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3184
  }
3185
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
@@ -3196,7 +3196,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
3196
  static_assert(QK5_0 == 32, "QK5_0 must be 32");
3197
 
3198
  if (!quant_weights) {
3199
- quantize_row_q5_0_reference(x, y, n_per_row);
3200
  return;
3201
  }
3202
 
@@ -3233,7 +3233,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
3233
 
3234
  size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3235
  if (!quant_weights) {
3236
- quantize_row_q5_0_reference(src, dst, (int64_t)nrow*n_per_row);
3237
  return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3238
  }
3239
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
@@ -3250,7 +3250,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
3250
  static_assert(QK5_1 == 32, "QK5_1 must be 32");
3251
 
3252
  if (!quant_weights) {
3253
- quantize_row_q5_1_reference(x, y, n_per_row);
3254
  return;
3255
  }
3256
 
@@ -3286,7 +3286,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
3286
 
3287
  size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3288
  if (!quant_weights) {
3289
- quantize_row_q5_1_reference(src, dst, (int64_t)nrow*n_per_row);
3290
  return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3291
  }
3292
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
@@ -3302,7 +3302,7 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr
3302
  size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3303
  (void)quant_weights; // not used
3304
  const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
3305
- quantize_row_q8_0_reference(src, dst, (int64_t)nrow*n_per_row);
3306
  return nrow * row_size;
3307
  }
3308
 
@@ -3590,7 +3590,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
3590
 
3591
  //===================================== Q8_K ==============================================
3592
 
3593
- void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int64_t k) {
3594
  assert(k % QK_K == 0);
3595
  const int64_t nb = k / QK_K;
3596
 
@@ -3641,7 +3641,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int6
3641
  }
3642
 
3643
  void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
3644
- quantize_row_q8_K_reference(x, y, k);
3645
  }
3646
 
3647
  //===================================== Dot products =================================
@@ -13542,10 +13542,10 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t
13542
  void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
13543
  assert(k % QK_K == 0);
13544
  block_iq3_xxs * restrict y = vy;
13545
- quantize_row_iq3_xxs_reference(x, y, k);
13546
  }
13547
 
13548
- void quantize_row_iq3_xxs_reference(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
13549
  assert(k % QK_K == 0);
13550
  quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
13551
  }
@@ -13758,10 +13758,10 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n
13758
  void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
13759
  assert(k % QK_K == 0);
13760
  block_iq3_s * restrict y = vy;
13761
- quantize_row_iq3_s_reference(x, y, k);
13762
  }
13763
 
13764
- void quantize_row_iq3_s_reference(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
13765
  assert(k % QK_K == 0);
13766
  quantize_iq3_s(x, y, 1, k, NULL);
13767
  }
@@ -14499,7 +14499,7 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k
14499
  }
14500
  }
14501
 
14502
- void quantize_row_iq4_nl_reference(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
14503
  assert(k % QK4_NL == 0);
14504
  quantize_row_iq4_nl(x, y, k);
14505
  }
@@ -14527,10 +14527,10 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
14527
  void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
14528
  assert(k % QK_K == 0);
14529
  block_iq4_xs * restrict y = vy;
14530
- quantize_row_iq4_xs_reference(x, y, k);
14531
  }
14532
 
14533
- void quantize_row_iq4_xs_reference(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
14534
  assert(k % QK_K == 0);
14535
  quantize_iq4_xs(x, y, 1, k, NULL);
14536
  }
@@ -14717,7 +14717,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n
14717
  return nrow * nblock * sizeof(block_iq2_s);
14718
  }
14719
 
14720
- void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
14721
  assert(k % QK_K == 0);
14722
  quantize_iq2_s(x, y, 1, k, NULL);
14723
  }
@@ -14725,7 +14725,7 @@ void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restri
14725
  void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
14726
  assert(k % QK_K == 0);
14727
  block_iq2_s * restrict y = vy;
14728
- quantize_row_iq2_s_reference(x, y, k);
14729
  }
14730
 
14731
  static bool validate_float(float f, size_t i) {
 
658
  #endif //__loongarch_asx
659
 
660
  // reference implementation for deterministic creation of model files
661
+ void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
662
  static const int qk = QK4_0;
663
 
664
  assert(k % qk == 0);
 
696
  }
697
 
698
  void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
699
+ quantize_row_q4_0_ref(x, y, k);
700
  }
701
 
702
 
703
+ void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
704
  const int qk = QK4_1;
705
 
706
  assert(k % qk == 0);
 
738
  }
739
 
740
  void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
741
+ quantize_row_q4_1_ref(x, y, k);
742
  }
743
 
744
+ void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
745
  static const int qk = QK5_0;
746
 
747
  assert(k % qk == 0);
 
786
  }
787
 
788
  void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
789
+ quantize_row_q5_0_ref(x, y, k);
790
  }
791
 
792
+ void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
793
  const int qk = QK5_1;
794
 
795
  assert(k % qk == 0);
 
834
  }
835
 
836
  void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
837
+ quantize_row_q5_1_ref(x, y, k);
838
  }
839
 
840
  // reference implementation for deterministic creation of model files
841
+ void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
842
  assert(k % QK8_0 == 0);
843
  const int nb = k / QK8_0;
844
 
 
1144
  #else
1145
  GGML_UNUSED(nb);
1146
  // scalar
1147
+ quantize_row_q8_0_ref(x, y, k);
1148
  #endif
1149
  }
1150
 
1151
  // reference implementation for deterministic creation of model files
1152
+ void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
1153
  assert(QK8_1 == 32);
1154
  assert(k % QK8_1 == 0);
1155
  const int nb = k / QK8_1;
 
1508
  #else
1509
  GGML_UNUSED(nb);
1510
  // scalar
1511
+ quantize_row_q8_1_ref(x, y, k);
1512
  #endif
1513
  }
1514
 
 
1899
 
1900
  //========================- 2-bit (de)-quantization
1901
 
1902
+ void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) {
1903
  assert(k % QK_K == 0);
1904
  const int nb = k / QK_K;
1905
 
 
2002
  }
2003
 
2004
  void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
2005
+ quantize_row_q2_K_ref(x, vy, k);
2006
  }
2007
 
2008
  static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
 
2226
  size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2227
  size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
2228
  if (!quant_weights) {
2229
+ quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
2230
  }
2231
  else {
2232
  char * qrow = (char *)dst;
 
2241
 
2242
  //========================= 3-bit (de)-quantization
2243
 
2244
+ void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) {
2245
  assert(k % QK_K == 0);
2246
  const int nb = k / QK_K;
2247
 
 
2368
  }
2369
 
2370
  void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
2371
+ quantize_row_q3_K_ref(x, vy, k);
2372
  }
2373
 
2374
  static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
 
2458
  size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2459
  size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
2460
  if (!quant_weights) {
2461
+ quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
2462
  }
2463
  else {
2464
  char * qrow = (char *)dst;
 
2473
 
2474
  // ====================== 4-bit (de)-quantization
2475
 
2476
+ void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) {
2477
  assert(k % QK_K == 0);
2478
  const int nb = k / QK_K;
2479
 
 
2572
  void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
2573
  assert(k % QK_K == 0);
2574
  block_q4_K * restrict y = vy;
2575
+ quantize_row_q4_K_ref(x, y, k);
2576
  }
2577
 
2578
  static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
 
2651
  size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2652
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
2653
  if (!quant_weights) {
2654
+ quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
2655
  }
2656
  else {
2657
  char * qrow = (char *)dst;
 
2666
 
2667
  // ====================== 5-bit (de)-quantization
2668
 
2669
+ void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) {
2670
  assert(k % QK_K == 0);
2671
  const int64_t nb = k / QK_K;
2672
 
 
2783
  void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
2784
  assert(k % QK_K == 0);
2785
  block_q5_K * restrict y = vy;
2786
+ quantize_row_q5_K_ref(x, y, k);
2787
  }
2788
 
2789
  static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
 
2882
  size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
2883
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
2884
  if (!quant_weights) {
2885
+ quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
2886
  }
2887
  else {
2888
  char * qrow = (char *)dst;
 
2897
 
2898
  // ====================== 6-bit (de)-quantization
2899
 
2900
+ void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) {
2901
  assert(k % QK_K == 0);
2902
  const int64_t nb = k / QK_K;
2903
 
 
3001
  void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
3002
  assert(k % QK_K == 0);
3003
  block_q6_K * restrict y = vy;
3004
+ quantize_row_q6_K_ref(x, y, k);
3005
  }
3006
 
3007
  static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
 
3091
  size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3092
  size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
3093
  if (!quant_weights) {
3094
+ quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
3095
  }
3096
  else {
3097
  char * qrow = (char *)dst;
 
3108
  static_assert(QK4_0 == 32, "QK4_0 must be 32");
3109
 
3110
  if (!quant_weights) {
3111
+ quantize_row_q4_0_ref(x, y, n_per_row);
3112
  return;
3113
  }
3114
 
 
3134
 
3135
  size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3136
  if (!quant_weights) {
3137
+ quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
3138
  return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
3139
  }
3140
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
 
3151
  static_assert(QK4_1 == 32, "QK4_1 must be 32");
3152
 
3153
  if (!quant_weights) {
3154
+ quantize_row_q4_1_ref(x, y, n_per_row);
3155
  return;
3156
  }
3157
 
 
3179
 
3180
  size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3181
  if (!quant_weights) {
3182
+ quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
3183
  return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
3184
  }
3185
  size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
 
3196
  static_assert(QK5_0 == 32, "QK5_0 must be 32");
3197
 
3198
  if (!quant_weights) {
3199
+ quantize_row_q5_0_ref(x, y, n_per_row);
3200
  return;
3201
  }
3202
 
 
3233
 
3234
  size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3235
  if (!quant_weights) {
3236
+ quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
3237
  return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
3238
  }
3239
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
 
3250
  static_assert(QK5_1 == 32, "QK5_1 must be 32");
3251
 
3252
  if (!quant_weights) {
3253
+ quantize_row_q5_1_ref(x, y, n_per_row);
3254
  return;
3255
  }
3256
 
 
3286
 
3287
  size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3288
  if (!quant_weights) {
3289
+ quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
3290
  return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
3291
  }
3292
  size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
 
3302
  size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
3303
  (void)quant_weights; // not used
3304
  const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
3305
+ quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
3306
  return nrow * row_size;
3307
  }
3308
 
 
3590
 
3591
  //===================================== Q8_K ==============================================
3592
 
3593
+ void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) {
3594
  assert(k % QK_K == 0);
3595
  const int64_t nb = k / QK_K;
3596
 
 
3641
  }
3642
 
3643
  void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
3644
+ quantize_row_q8_K_ref(x, y, k);
3645
  }
3646
 
3647
  //===================================== Dot products =================================
 
13542
  void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
13543
  assert(k % QK_K == 0);
13544
  block_iq3_xxs * restrict y = vy;
13545
+ quantize_row_iq3_xxs_ref(x, y, k);
13546
  }
13547
 
13548
+ void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
13549
  assert(k % QK_K == 0);
13550
  quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
13551
  }
 
13758
  void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
13759
  assert(k % QK_K == 0);
13760
  block_iq3_s * restrict y = vy;
13761
+ quantize_row_iq3_s_ref(x, y, k);
13762
  }
13763
 
13764
+ void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
13765
  assert(k % QK_K == 0);
13766
  quantize_iq3_s(x, y, 1, k, NULL);
13767
  }
 
14499
  }
14500
  }
14501
 
14502
+ void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
14503
  assert(k % QK4_NL == 0);
14504
  quantize_row_iq4_nl(x, y, k);
14505
  }
 
14527
  void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
14528
  assert(k % QK_K == 0);
14529
  block_iq4_xs * restrict y = vy;
14530
+ quantize_row_iq4_xs_ref(x, y, k);
14531
  }
14532
 
14533
+ void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
14534
  assert(k % QK_K == 0);
14535
  quantize_iq4_xs(x, y, 1, k, NULL);
14536
  }
 
14717
  return nrow * nblock * sizeof(block_iq2_s);
14718
  }
14719
 
14720
+ void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
14721
  assert(k % QK_K == 0);
14722
  quantize_iq2_s(x, y, 1, k, NULL);
14723
  }
 
14725
  void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
14726
  assert(k % QK_K == 0);
14727
  block_iq2_s * restrict y = vy;
14728
+ quantize_row_iq2_s_ref(x, y, k);
14729
  }
14730
 
14731
  static bool validate_float(float f, size_t i) {
ggml/src/ggml-quants.h CHANGED
@@ -12,25 +12,25 @@ extern "C" {
12
  #endif
13
 
14
  // Quantization
15
- void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
16
- void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
17
- void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
18
- void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
19
- void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
20
- void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
21
-
22
- void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
23
- void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
24
- void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
25
- void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
26
- void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
27
- void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
28
-
29
- void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
30
- void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
31
- void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
32
- void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
33
- void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
34
 
35
  void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
36
  void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
 
12
  #endif
13
 
14
  // Quantization
15
+ void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
16
+ void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
17
+ void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
18
+ void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
19
+ void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
20
+ void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
21
+
22
+ void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
23
+ void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
24
+ void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
25
+ void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
26
+ void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
27
+ void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
28
+
29
+ void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
30
+ void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
31
+ void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
32
+ void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
33
+ void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
34
 
35
  void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
36
  void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
ggml/src/ggml.c CHANGED
@@ -592,7 +592,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
592
  .is_quantized = false,
593
  .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
594
  .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
595
- .from_float_reference = (ggml_from_float_t) ggml_fp32_to_fp16_row,
596
  .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
597
  .vec_dot_type = GGML_TYPE_F16,
598
  .nrows = 1,
@@ -604,7 +604,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
604
  .is_quantized = true,
605
  .to_float = (ggml_to_float_t) dequantize_row_q4_0,
606
  .from_float = quantize_row_q4_0,
607
- .from_float_reference = (ggml_from_float_t) quantize_row_q4_0_reference,
608
  .vec_dot = ggml_vec_dot_q4_0_q8_0,
609
  .vec_dot_type = GGML_TYPE_Q8_0,
610
  #if defined (__ARM_FEATURE_MATMUL_INT8)
@@ -620,7 +620,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
620
  .is_quantized = true,
621
  .to_float = (ggml_to_float_t) dequantize_row_q4_1,
622
  .from_float = quantize_row_q4_1,
623
- .from_float_reference = (ggml_from_float_t) quantize_row_q4_1_reference,
624
  .vec_dot = ggml_vec_dot_q4_1_q8_1,
625
  .vec_dot_type = GGML_TYPE_Q8_1,
626
  #if defined (__ARM_FEATURE_MATMUL_INT8)
@@ -636,7 +636,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
636
  .is_quantized = false,
637
  .to_float = NULL,
638
  .from_float = NULL,
639
- .from_float_reference = NULL,
640
  .vec_dot = NULL,
641
  .vec_dot_type = GGML_TYPE_COUNT,
642
  .nrows = 1,
@@ -648,7 +648,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
648
  .is_quantized = false,
649
  .to_float = NULL,
650
  .from_float = NULL,
651
- .from_float_reference = NULL,
652
  .vec_dot = NULL,
653
  .vec_dot_type = GGML_TYPE_COUNT,
654
  .nrows = 1,
@@ -660,7 +660,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
660
  .is_quantized = true,
661
  .to_float = (ggml_to_float_t) dequantize_row_q5_0,
662
  .from_float = quantize_row_q5_0,
663
- .from_float_reference = (ggml_from_float_t) quantize_row_q5_0_reference,
664
  .vec_dot = ggml_vec_dot_q5_0_q8_0,
665
  .vec_dot_type = GGML_TYPE_Q8_0,
666
  .nrows = 1,
@@ -672,7 +672,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
672
  .is_quantized = true,
673
  .to_float = (ggml_to_float_t) dequantize_row_q5_1,
674
  .from_float = quantize_row_q5_1,
675
- .from_float_reference = (ggml_from_float_t) quantize_row_q5_1_reference,
676
  .vec_dot = ggml_vec_dot_q5_1_q8_1,
677
  .vec_dot_type = GGML_TYPE_Q8_1,
678
  .nrows = 1,
@@ -684,7 +684,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
684
  .is_quantized = true,
685
  .to_float = (ggml_to_float_t) dequantize_row_q8_0,
686
  .from_float = quantize_row_q8_0,
687
- .from_float_reference = (ggml_from_float_t) quantize_row_q8_0_reference,
 
688
  .vec_dot = ggml_vec_dot_q8_0_q8_0,
689
  .vec_dot_type = GGML_TYPE_Q8_0,
690
  #if defined (__ARM_FEATURE_MATMUL_INT8)
@@ -692,7 +693,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
692
  #else
693
  .nrows = 1,
694
  #endif
695
- .from_float_to_mat = quantize_mat_q8_0,
696
  },
697
  [GGML_TYPE_Q8_1] = {
698
  .type_name = "q8_1",
@@ -700,7 +700,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
700
  .type_size = sizeof(block_q8_1),
701
  .is_quantized = true,
702
  .from_float = quantize_row_q8_1,
703
- .from_float_reference = (ggml_from_float_t) quantize_row_q8_1_reference,
704
  .vec_dot_type = GGML_TYPE_Q8_1,
705
  .nrows = 1,
706
  },
@@ -711,7 +711,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
711
  .is_quantized = true,
712
  .to_float = (ggml_to_float_t) dequantize_row_q2_K,
713
  .from_float = quantize_row_q2_K,
714
- .from_float_reference = (ggml_from_float_t) quantize_row_q2_K_reference,
715
  .vec_dot = ggml_vec_dot_q2_K_q8_K,
716
  .vec_dot_type = GGML_TYPE_Q8_K,
717
  .nrows = 1,
@@ -723,7 +723,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
723
  .is_quantized = true,
724
  .to_float = (ggml_to_float_t) dequantize_row_q3_K,
725
  .from_float = quantize_row_q3_K,
726
- .from_float_reference = (ggml_from_float_t) quantize_row_q3_K_reference,
727
  .vec_dot = ggml_vec_dot_q3_K_q8_K,
728
  .vec_dot_type = GGML_TYPE_Q8_K,
729
  .nrows = 1,
@@ -735,7 +735,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
735
  .is_quantized = true,
736
  .to_float = (ggml_to_float_t) dequantize_row_q4_K,
737
  .from_float = quantize_row_q4_K,
738
- .from_float_reference = (ggml_from_float_t) quantize_row_q4_K_reference,
739
  .vec_dot = ggml_vec_dot_q4_K_q8_K,
740
  .vec_dot_type = GGML_TYPE_Q8_K,
741
  .nrows = 1,
@@ -747,7 +747,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
747
  .is_quantized = true,
748
  .to_float = (ggml_to_float_t) dequantize_row_q5_K,
749
  .from_float = quantize_row_q5_K,
750
- .from_float_reference = (ggml_from_float_t) quantize_row_q5_K_reference,
751
  .vec_dot = ggml_vec_dot_q5_K_q8_K,
752
  .vec_dot_type = GGML_TYPE_Q8_K,
753
  .nrows = 1,
@@ -759,7 +759,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
759
  .is_quantized = true,
760
  .to_float = (ggml_to_float_t) dequantize_row_q6_K,
761
  .from_float = quantize_row_q6_K,
762
- .from_float_reference = (ggml_from_float_t) quantize_row_q6_K_reference,
763
  .vec_dot = ggml_vec_dot_q6_K_q8_K,
764
  .vec_dot_type = GGML_TYPE_Q8_K,
765
  .nrows = 1,
@@ -771,7 +771,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
771
  .is_quantized = true,
772
  .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
773
  .from_float = NULL,
774
- .from_float_reference = NULL,
775
  .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
776
  .vec_dot_type = GGML_TYPE_Q8_K,
777
  .nrows = 1,
@@ -783,7 +783,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
783
  .is_quantized = true,
784
  .to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
785
  .from_float = NULL,
786
- .from_float_reference = NULL,
787
  .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
788
  .vec_dot_type = GGML_TYPE_Q8_K,
789
  .nrows = 1,
@@ -795,7 +795,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
795
  .is_quantized = true,
796
  .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
797
  .from_float = quantize_row_iq3_xxs,
798
- .from_float_reference = (ggml_from_float_t)quantize_row_iq3_xxs_reference,
799
  .vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
800
  .vec_dot_type = GGML_TYPE_Q8_K,
801
  .nrows = 1,
@@ -807,7 +807,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
807
  .is_quantized = true,
808
  .to_float = (ggml_to_float_t) dequantize_row_iq3_s,
809
  .from_float = quantize_row_iq3_s,
810
- .from_float_reference = (ggml_from_float_t)quantize_row_iq3_s_reference,
811
  .vec_dot = ggml_vec_dot_iq3_s_q8_K,
812
  .vec_dot_type = GGML_TYPE_Q8_K,
813
  .nrows = 1,
@@ -819,7 +819,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
819
  .is_quantized = true,
820
  .to_float = (ggml_to_float_t) dequantize_row_iq2_s,
821
  .from_float = quantize_row_iq2_s,
822
- .from_float_reference = (ggml_from_float_t)quantize_row_iq2_s_reference,
823
  .vec_dot = ggml_vec_dot_iq2_s_q8_K,
824
  .vec_dot_type = GGML_TYPE_Q8_K,
825
  .nrows = 1,
@@ -831,7 +831,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
831
  .is_quantized = true,
832
  .to_float = (ggml_to_float_t) dequantize_row_iq1_s,
833
  .from_float = NULL,
834
- .from_float_reference = NULL,
835
  .vec_dot = ggml_vec_dot_iq1_s_q8_K,
836
  .vec_dot_type = GGML_TYPE_Q8_K,
837
  .nrows = 1,
@@ -843,7 +843,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
843
  .is_quantized = true,
844
  .to_float = (ggml_to_float_t) dequantize_row_iq1_m,
845
  .from_float = NULL,
846
- .from_float_reference = NULL,
847
  .vec_dot = ggml_vec_dot_iq1_m_q8_K,
848
  .vec_dot_type = GGML_TYPE_Q8_K,
849
  .nrows = 1,
@@ -855,7 +855,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
855
  .is_quantized = true,
856
  .to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
857
  .from_float = quantize_row_iq4_nl,
858
- .from_float_reference = (ggml_from_float_t)quantize_row_iq4_nl_reference,
859
  .vec_dot = ggml_vec_dot_iq4_nl_q8_0,
860
  .vec_dot_type = GGML_TYPE_Q8_0,
861
  .nrows = 1,
@@ -867,7 +867,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
867
  .is_quantized = true,
868
  .to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
869
  .from_float = quantize_row_iq4_xs,
870
- .from_float_reference = (ggml_from_float_t)quantize_row_iq4_xs_reference,
871
  .vec_dot = ggml_vec_dot_iq4_xs_q8_K,
872
  .vec_dot_type = GGML_TYPE_Q8_K,
873
  .nrows = 1,
@@ -886,7 +886,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
886
  .is_quantized = false,
887
  .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
888
  .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
889
- .from_float_reference = (ggml_from_float_t) ggml_fp32_to_bf16_row,
890
  .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
891
  .vec_dot_type = GGML_TYPE_BF16,
892
  .nrows = 1,
@@ -894,48 +894,48 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
894
  [GGML_TYPE_Q4_0_4_4] = {
895
  .type_name = "q4_0_4x4",
896
  .blck_size = QK4_0,
 
897
  .type_size = sizeof(block_q4_0),
898
  .is_quantized = true,
899
  .to_float = NULL,
900
  .from_float = NULL,
901
- .from_float_reference = NULL,
902
  .vec_dot = NULL,
903
  .vec_dot_type = GGML_TYPE_Q8_0,
904
  .nrows = 1,
905
  .ncols = 4,
906
- .interleave_blcksize = 4,
907
  .gemv = ggml_gemv_q4_0_4x4_q8_0,
908
  .gemm = ggml_gemm_q4_0_4x4_q8_0,
909
  },
910
  [GGML_TYPE_Q4_0_4_8] = {
911
  .type_name = "q4_0_4x8",
912
  .blck_size = QK4_0,
 
913
  .type_size = sizeof(block_q4_0),
914
  .is_quantized = true,
915
  .to_float = NULL,
916
  .from_float = NULL,
917
- .from_float_reference = NULL,
918
  .vec_dot = NULL,
919
  .vec_dot_type = GGML_TYPE_Q8_0,
920
  .nrows = 1,
921
  .ncols = 4,
922
- .interleave_blcksize = 8,
923
  .gemv = ggml_gemv_q4_0_4x8_q8_0,
924
  .gemm = ggml_gemm_q4_0_4x8_q8_0,
925
  },
926
  [GGML_TYPE_Q4_0_8_8] = {
927
  .type_name = "q4_0_8x8",
928
  .blck_size = QK4_0,
 
929
  .type_size = sizeof(block_q4_0),
930
  .is_quantized = true,
931
  .to_float = NULL,
932
  .from_float = NULL,
933
- .from_float_reference = NULL,
934
  .vec_dot = NULL,
935
  .vec_dot_type = GGML_TYPE_Q8_0,
936
  .nrows = 1,
937
  .ncols = 8,
938
- .interleave_blcksize = 8,
939
  .gemv = ggml_gemv_q4_0_8x8_q8_0,
940
  .gemm = ggml_gemm_q4_0_8x8_q8_0,
941
  }
@@ -3115,7 +3115,7 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
3115
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
3116
  }
3117
 
3118
- GGML_CALL int ggml_blck_size(enum ggml_type type) {
3119
  return type_traits[type].blck_size;
3120
  }
3121
 
@@ -12192,15 +12192,14 @@ static void ggml_compute_forward_mul_mat(
12192
 
12193
  const enum ggml_type type = src0->type;
12194
 
12195
- enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
12196
- ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
12197
- int64_t const vec_dot_num_rows = type_traits[type].nrows;
12198
- int64_t const matmul_num_cols = type_traits[type].ncols;
12199
- int64_t const interleave_blcksize = type_traits[type].interleave_blcksize;
12200
- ggml_from_float_to_mat_t const from_float_to_mat
12201
- = type_traits[vec_dot_type].from_float_to_mat;
12202
- ggml_gemv_t const gemv = type_traits[type].gemv;
12203
- ggml_gemm_t const gemm = type_traits[type].gemm;
12204
 
12205
  GGML_ASSERT(ne0 == ne01);
12206
  GGML_ASSERT(ne1 == ne11);
@@ -12264,14 +12263,14 @@ UseGgmlGemm1:;
12264
  for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
12265
  from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
12266
  (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
12267
- 4, ne10, interleave_blcksize);
12268
  }
12269
  i11_processed = ne11 - ne11 % 4;
12270
  }
12271
  for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
12272
- from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
12273
- (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
12274
- ne10);
12275
  }
12276
  }
12277
  }
@@ -12355,7 +12354,7 @@ UseGgmlGemm2:;
12355
  int64_t src0_start = (ith * ne01) / nth;
12356
  int64_t src0_end = ((ith + 1) * ne01) / nth;
12357
  src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
12358
- src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
12359
  if (src0_start >= src0_end) return;
12360
 
12361
  // If there are more than three rows in src1, use gemm; otherwise, use gemv.
@@ -12413,11 +12412,11 @@ static void ggml_compute_forward_mul_mat_id(
12413
 
12414
  const bool src1_cont = ggml_is_contiguous(src1);
12415
 
12416
- ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
12417
- enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
12418
- ggml_from_float_t const from_float_to_vec_dot = type_traits[vec_dot_type].from_float;
12419
- int64_t const matmul_num_cols = type_traits[type].ncols;
12420
- ggml_gemv_t const gemv = type_traits[type].gemv;
12421
 
12422
  // we don't support permuted src0 or src1
12423
  GGML_ASSERT(nb00 == ggml_type_size(type));
@@ -12458,9 +12457,9 @@ static void ggml_compute_forward_mul_mat_id(
12458
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
12459
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
12460
  for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
12461
- from_float_to_vec_dot((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
12462
- (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
12463
- ne10);
12464
  }
12465
  }
12466
  }
@@ -21062,8 +21061,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
21062
  (int64_t) info->ne[3];
21063
 
21064
  if (ne % ggml_blck_size(info->type) != 0) {
21065
- fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n",
21066
- __func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
21067
  fclose(file);
21068
  gguf_free(ctx);
21069
  return NULL;
 
592
  .is_quantized = false,
593
  .to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
594
  .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
595
+ .from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row,
596
  .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
597
  .vec_dot_type = GGML_TYPE_F16,
598
  .nrows = 1,
 
604
  .is_quantized = true,
605
  .to_float = (ggml_to_float_t) dequantize_row_q4_0,
606
  .from_float = quantize_row_q4_0,
607
+ .from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref,
608
  .vec_dot = ggml_vec_dot_q4_0_q8_0,
609
  .vec_dot_type = GGML_TYPE_Q8_0,
610
  #if defined (__ARM_FEATURE_MATMUL_INT8)
 
620
  .is_quantized = true,
621
  .to_float = (ggml_to_float_t) dequantize_row_q4_1,
622
  .from_float = quantize_row_q4_1,
623
+ .from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref,
624
  .vec_dot = ggml_vec_dot_q4_1_q8_1,
625
  .vec_dot_type = GGML_TYPE_Q8_1,
626
  #if defined (__ARM_FEATURE_MATMUL_INT8)
 
636
  .is_quantized = false,
637
  .to_float = NULL,
638
  .from_float = NULL,
639
+ .from_float_ref = NULL,
640
  .vec_dot = NULL,
641
  .vec_dot_type = GGML_TYPE_COUNT,
642
  .nrows = 1,
 
648
  .is_quantized = false,
649
  .to_float = NULL,
650
  .from_float = NULL,
651
+ .from_float_ref = NULL,
652
  .vec_dot = NULL,
653
  .vec_dot_type = GGML_TYPE_COUNT,
654
  .nrows = 1,
 
660
  .is_quantized = true,
661
  .to_float = (ggml_to_float_t) dequantize_row_q5_0,
662
  .from_float = quantize_row_q5_0,
663
+ .from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref,
664
  .vec_dot = ggml_vec_dot_q5_0_q8_0,
665
  .vec_dot_type = GGML_TYPE_Q8_0,
666
  .nrows = 1,
 
672
  .is_quantized = true,
673
  .to_float = (ggml_to_float_t) dequantize_row_q5_1,
674
  .from_float = quantize_row_q5_1,
675
+ .from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref,
676
  .vec_dot = ggml_vec_dot_q5_1_q8_1,
677
  .vec_dot_type = GGML_TYPE_Q8_1,
678
  .nrows = 1,
 
684
  .is_quantized = true,
685
  .to_float = (ggml_to_float_t) dequantize_row_q8_0,
686
  .from_float = quantize_row_q8_0,
687
+ .from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref,
688
+ .from_float_to_mat = quantize_mat_q8_0,
689
  .vec_dot = ggml_vec_dot_q8_0_q8_0,
690
  .vec_dot_type = GGML_TYPE_Q8_0,
691
  #if defined (__ARM_FEATURE_MATMUL_INT8)
 
693
  #else
694
  .nrows = 1,
695
  #endif
 
696
  },
697
  [GGML_TYPE_Q8_1] = {
698
  .type_name = "q8_1",
 
700
  .type_size = sizeof(block_q8_1),
701
  .is_quantized = true,
702
  .from_float = quantize_row_q8_1,
703
+ .from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref,
704
  .vec_dot_type = GGML_TYPE_Q8_1,
705
  .nrows = 1,
706
  },
 
711
  .is_quantized = true,
712
  .to_float = (ggml_to_float_t) dequantize_row_q2_K,
713
  .from_float = quantize_row_q2_K,
714
+ .from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref,
715
  .vec_dot = ggml_vec_dot_q2_K_q8_K,
716
  .vec_dot_type = GGML_TYPE_Q8_K,
717
  .nrows = 1,
 
723
  .is_quantized = true,
724
  .to_float = (ggml_to_float_t) dequantize_row_q3_K,
725
  .from_float = quantize_row_q3_K,
726
+ .from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref,
727
  .vec_dot = ggml_vec_dot_q3_K_q8_K,
728
  .vec_dot_type = GGML_TYPE_Q8_K,
729
  .nrows = 1,
 
735
  .is_quantized = true,
736
  .to_float = (ggml_to_float_t) dequantize_row_q4_K,
737
  .from_float = quantize_row_q4_K,
738
+ .from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref,
739
  .vec_dot = ggml_vec_dot_q4_K_q8_K,
740
  .vec_dot_type = GGML_TYPE_Q8_K,
741
  .nrows = 1,
 
747
  .is_quantized = true,
748
  .to_float = (ggml_to_float_t) dequantize_row_q5_K,
749
  .from_float = quantize_row_q5_K,
750
+ .from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref,
751
  .vec_dot = ggml_vec_dot_q5_K_q8_K,
752
  .vec_dot_type = GGML_TYPE_Q8_K,
753
  .nrows = 1,
 
759
  .is_quantized = true,
760
  .to_float = (ggml_to_float_t) dequantize_row_q6_K,
761
  .from_float = quantize_row_q6_K,
762
+ .from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref,
763
  .vec_dot = ggml_vec_dot_q6_K_q8_K,
764
  .vec_dot_type = GGML_TYPE_Q8_K,
765
  .nrows = 1,
 
771
  .is_quantized = true,
772
  .to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
773
  .from_float = NULL,
774
+ .from_float_ref = NULL,
775
  .vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
776
  .vec_dot_type = GGML_TYPE_Q8_K,
777
  .nrows = 1,
 
783
  .is_quantized = true,
784
  .to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
785
  .from_float = NULL,
786
+ .from_float_ref = NULL,
787
  .vec_dot = ggml_vec_dot_iq2_xs_q8_K,
788
  .vec_dot_type = GGML_TYPE_Q8_K,
789
  .nrows = 1,
 
795
  .is_quantized = true,
796
  .to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
797
  .from_float = quantize_row_iq3_xxs,
798
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
799
  .vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
800
  .vec_dot_type = GGML_TYPE_Q8_K,
801
  .nrows = 1,
 
807
  .is_quantized = true,
808
  .to_float = (ggml_to_float_t) dequantize_row_iq3_s,
809
  .from_float = quantize_row_iq3_s,
810
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref,
811
  .vec_dot = ggml_vec_dot_iq3_s_q8_K,
812
  .vec_dot_type = GGML_TYPE_Q8_K,
813
  .nrows = 1,
 
819
  .is_quantized = true,
820
  .to_float = (ggml_to_float_t) dequantize_row_iq2_s,
821
  .from_float = quantize_row_iq2_s,
822
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref,
823
  .vec_dot = ggml_vec_dot_iq2_s_q8_K,
824
  .vec_dot_type = GGML_TYPE_Q8_K,
825
  .nrows = 1,
 
831
  .is_quantized = true,
832
  .to_float = (ggml_to_float_t) dequantize_row_iq1_s,
833
  .from_float = NULL,
834
+ .from_float_ref = NULL,
835
  .vec_dot = ggml_vec_dot_iq1_s_q8_K,
836
  .vec_dot_type = GGML_TYPE_Q8_K,
837
  .nrows = 1,
 
843
  .is_quantized = true,
844
  .to_float = (ggml_to_float_t) dequantize_row_iq1_m,
845
  .from_float = NULL,
846
+ .from_float_ref = NULL,
847
  .vec_dot = ggml_vec_dot_iq1_m_q8_K,
848
  .vec_dot_type = GGML_TYPE_Q8_K,
849
  .nrows = 1,
 
855
  .is_quantized = true,
856
  .to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
857
  .from_float = quantize_row_iq4_nl,
858
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref,
859
  .vec_dot = ggml_vec_dot_iq4_nl_q8_0,
860
  .vec_dot_type = GGML_TYPE_Q8_0,
861
  .nrows = 1,
 
867
  .is_quantized = true,
868
  .to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
869
  .from_float = quantize_row_iq4_xs,
870
+ .from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref,
871
  .vec_dot = ggml_vec_dot_iq4_xs_q8_K,
872
  .vec_dot_type = GGML_TYPE_Q8_K,
873
  .nrows = 1,
 
886
  .is_quantized = false,
887
  .to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
888
  .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
889
+ .from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row,
890
  .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
891
  .vec_dot_type = GGML_TYPE_BF16,
892
  .nrows = 1,
 
894
  [GGML_TYPE_Q4_0_4_4] = {
895
  .type_name = "q4_0_4x4",
896
  .blck_size = QK4_0,
897
+ .blck_size_interleave = 4,
898
  .type_size = sizeof(block_q4_0),
899
  .is_quantized = true,
900
  .to_float = NULL,
901
  .from_float = NULL,
902
+ .from_float_ref = NULL,
903
  .vec_dot = NULL,
904
  .vec_dot_type = GGML_TYPE_Q8_0,
905
  .nrows = 1,
906
  .ncols = 4,
 
907
  .gemv = ggml_gemv_q4_0_4x4_q8_0,
908
  .gemm = ggml_gemm_q4_0_4x4_q8_0,
909
  },
910
  [GGML_TYPE_Q4_0_4_8] = {
911
  .type_name = "q4_0_4x8",
912
  .blck_size = QK4_0,
913
+ .blck_size_interleave = 8,
914
  .type_size = sizeof(block_q4_0),
915
  .is_quantized = true,
916
  .to_float = NULL,
917
  .from_float = NULL,
918
+ .from_float_ref = NULL,
919
  .vec_dot = NULL,
920
  .vec_dot_type = GGML_TYPE_Q8_0,
921
  .nrows = 1,
922
  .ncols = 4,
 
923
  .gemv = ggml_gemv_q4_0_4x8_q8_0,
924
  .gemm = ggml_gemm_q4_0_4x8_q8_0,
925
  },
926
  [GGML_TYPE_Q4_0_8_8] = {
927
  .type_name = "q4_0_8x8",
928
  .blck_size = QK4_0,
929
+ .blck_size_interleave = 8,
930
  .type_size = sizeof(block_q4_0),
931
  .is_quantized = true,
932
  .to_float = NULL,
933
  .from_float = NULL,
934
+ .from_float_ref = NULL,
935
  .vec_dot = NULL,
936
  .vec_dot_type = GGML_TYPE_Q8_0,
937
  .nrows = 1,
938
  .ncols = 8,
 
939
  .gemv = ggml_gemv_q4_0_8x8_q8_0,
940
  .gemm = ggml_gemm_q4_0_8x8_q8_0,
941
  }
 
3115
  return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
3116
  }
3117
 
3118
+ GGML_CALL int64_t ggml_blck_size(enum ggml_type type) {
3119
  return type_traits[type].blck_size;
3120
  }
3121
 
 
12192
 
12193
  const enum ggml_type type = src0->type;
12194
 
12195
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
12196
+ ggml_from_float_t const from_float = type_traits[vec_dot_type].from_float;
12197
+ ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat;
12198
+ int64_t const vec_dot_num_rows = type_traits[type].nrows;
12199
+ int64_t const matmul_num_cols = type_traits[type].ncols;
12200
+ int64_t const blck_size_interleave = type_traits[type].blck_size_interleave;
12201
+ ggml_gemv_t const gemv = type_traits[type].gemv;
12202
+ ggml_gemm_t const gemm = type_traits[type].gemm;
 
12203
 
12204
  GGML_ASSERT(ne0 == ne01);
12205
  GGML_ASSERT(ne1 == ne11);
 
12263
  for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
12264
  from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
12265
  (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
12266
+ 4, ne10, blck_size_interleave);
12267
  }
12268
  i11_processed = ne11 - ne11 % 4;
12269
  }
12270
  for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
12271
+ from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
12272
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
12273
+ ne10);
12274
  }
12275
  }
12276
  }
 
12354
  int64_t src0_start = (ith * ne01) / nth;
12355
  int64_t src0_end = ((ith + 1) * ne01) / nth;
12356
  src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
12357
+ src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
12358
  if (src0_start >= src0_end) return;
12359
 
12360
  // If there are more than three rows in src1, use gemm; otherwise, use gemv.
 
12412
 
12413
  const bool src1_cont = ggml_is_contiguous(src1);
12414
 
12415
+ ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
12416
+ enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
12417
+ ggml_from_float_t const from_float = type_traits[vec_dot_type].from_float;
12418
+ int64_t const matmul_num_cols = type_traits[type].ncols;
12419
+ ggml_gemv_t const gemv = type_traits[type].gemv;
12420
 
12421
  // we don't support permuted src0 or src1
12422
  GGML_ASSERT(nb00 == ggml_type_size(type));
 
12457
  for (int64_t i13 = 0; i13 < ne13; ++i13) {
12458
  for (int64_t i12 = 0; i12 < ne12; ++i12) {
12459
  for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
12460
+ from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
12461
+ (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
12462
+ ne10);
12463
  }
12464
  }
12465
  }
 
21061
  (int64_t) info->ne[3];
21062
 
21063
  if (ne % ggml_blck_size(info->type) != 0) {
21064
+ fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
21065
+ __func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
21066
  fclose(file);
21067
  gguf_free(ctx);
21068
  return NULL;