Spaces:
Running
Running
ggml : minor naming changes (llama/8433)
Browse files
* ggml : minor naming changes
ggml-ci
* ggml : use PRId64 [no ci]
* ggml : revert FA K/Q names
- ggml/include/ggml.h +25 -25
- ggml/src/ggml-quants.c +47 -47
- ggml/src/ggml-quants.h +19 -19
- ggml/src/ggml.c +55 -56
ggml/include/ggml.h
CHANGED
|
@@ -714,9 +714,9 @@ extern "C" {
|
|
| 714 |
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
| 715 |
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
| 716 |
|
| 717 |
-
GGML_API GGML_CALL
|
| 718 |
-
GGML_API GGML_CALL size_t
|
| 719 |
-
GGML_API GGML_CALL size_t
|
| 720 |
|
| 721 |
GGML_DEPRECATED(
|
| 722 |
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
|
@@ -2410,31 +2410,31 @@ extern "C" {
|
|
| 2410 |
#endif
|
| 2411 |
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 2412 |
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 2413 |
-
typedef void (*
|
| 2414 |
-
|
| 2415 |
-
typedef void (*
|
| 2416 |
-
|
| 2417 |
-
typedef void (*ggml_gemv_t)
|
| 2418 |
-
|
| 2419 |
-
typedef void (*ggml_gemm_t)
|
| 2420 |
-
|
| 2421 |
|
| 2422 |
typedef struct {
|
| 2423 |
-
const char
|
| 2424 |
-
|
| 2425 |
-
|
| 2426 |
-
|
| 2427 |
-
|
| 2428 |
-
|
| 2429 |
-
ggml_from_float_t
|
| 2430 |
-
|
| 2431 |
-
enum ggml_type vec_dot_type;
|
| 2432 |
-
int64_t nrows; // number of rows to process simultaneously;
|
| 2433 |
-
int64_t ncols; // number of columns to process simultaneously;
|
| 2434 |
-
int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
|
| 2435 |
ggml_from_float_to_mat_t from_float_to_mat;
|
| 2436 |
-
|
| 2437 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2438 |
} ggml_type_traits_t;
|
| 2439 |
|
| 2440 |
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
|
|
|
| 714 |
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
|
| 715 |
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
|
| 716 |
|
| 717 |
+
GGML_API GGML_CALL int64_t ggml_blck_size(enum ggml_type type);
|
| 718 |
+
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
|
| 719 |
+
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
|
| 720 |
|
| 721 |
GGML_DEPRECATED(
|
| 722 |
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
|
|
|
|
| 2410 |
#endif
|
| 2411 |
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
|
| 2412 |
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 2413 |
+
typedef void (*ggml_from_float_to_mat_t)
|
| 2414 |
+
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
|
| 2415 |
+
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
|
| 2416 |
+
const void * GGML_RESTRICT y, size_t by, int nrc);
|
| 2417 |
+
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
| 2418 |
+
const void * GGML_RESTRICT y, int nr, int nc);
|
| 2419 |
+
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
|
| 2420 |
+
const void * GGML_RESTRICT y, int nr, int nc);
|
| 2421 |
|
| 2422 |
typedef struct {
|
| 2423 |
+
const char * type_name;
|
| 2424 |
+
int64_t blck_size;
|
| 2425 |
+
int64_t blck_size_interleave; // interleave elements in blocks
|
| 2426 |
+
size_t type_size;
|
| 2427 |
+
bool is_quantized;
|
| 2428 |
+
ggml_to_float_t to_float;
|
| 2429 |
+
ggml_from_float_t from_float;
|
| 2430 |
+
ggml_from_float_t from_float_ref;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2431 |
ggml_from_float_to_mat_t from_float_to_mat;
|
| 2432 |
+
ggml_vec_dot_t vec_dot;
|
| 2433 |
+
enum ggml_type vec_dot_type;
|
| 2434 |
+
int64_t nrows; // number of rows to process simultaneously
|
| 2435 |
+
int64_t ncols; // number of columns to process simultaneously
|
| 2436 |
+
ggml_gemv_t gemv;
|
| 2437 |
+
ggml_gemm_t gemm;
|
| 2438 |
} ggml_type_traits_t;
|
| 2439 |
|
| 2440 |
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
|
ggml/src/ggml-quants.c
CHANGED
|
@@ -658,7 +658,7 @@ static inline __m128i packNibbles( __m256i bytes ) {
|
|
| 658 |
#endif //__loongarch_asx
|
| 659 |
|
| 660 |
// reference implementation for deterministic creation of model files
|
| 661 |
-
void
|
| 662 |
static const int qk = QK4_0;
|
| 663 |
|
| 664 |
assert(k % qk == 0);
|
|
@@ -696,11 +696,11 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict
|
|
| 696 |
}
|
| 697 |
|
| 698 |
void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
|
| 699 |
-
|
| 700 |
}
|
| 701 |
|
| 702 |
|
| 703 |
-
void
|
| 704 |
const int qk = QK4_1;
|
| 705 |
|
| 706 |
assert(k % qk == 0);
|
|
@@ -738,10 +738,10 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict
|
|
| 738 |
}
|
| 739 |
|
| 740 |
void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
|
| 741 |
-
|
| 742 |
}
|
| 743 |
|
| 744 |
-
void
|
| 745 |
static const int qk = QK5_0;
|
| 746 |
|
| 747 |
assert(k % qk == 0);
|
|
@@ -786,10 +786,10 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict
|
|
| 786 |
}
|
| 787 |
|
| 788 |
void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
|
| 789 |
-
|
| 790 |
}
|
| 791 |
|
| 792 |
-
void
|
| 793 |
const int qk = QK5_1;
|
| 794 |
|
| 795 |
assert(k % qk == 0);
|
|
@@ -834,11 +834,11 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict
|
|
| 834 |
}
|
| 835 |
|
| 836 |
void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
|
| 837 |
-
|
| 838 |
}
|
| 839 |
|
| 840 |
// reference implementation for deterministic creation of model files
|
| 841 |
-
void
|
| 842 |
assert(k % QK8_0 == 0);
|
| 843 |
const int nb = k / QK8_0;
|
| 844 |
|
|
@@ -1144,12 +1144,12 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
|
|
| 1144 |
#else
|
| 1145 |
GGML_UNUSED(nb);
|
| 1146 |
// scalar
|
| 1147 |
-
|
| 1148 |
#endif
|
| 1149 |
}
|
| 1150 |
|
| 1151 |
// reference implementation for deterministic creation of model files
|
| 1152 |
-
void
|
| 1153 |
assert(QK8_1 == 32);
|
| 1154 |
assert(k % QK8_1 == 0);
|
| 1155 |
const int nb = k / QK8_1;
|
|
@@ -1508,7 +1508,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k)
|
|
| 1508 |
#else
|
| 1509 |
GGML_UNUSED(nb);
|
| 1510 |
// scalar
|
| 1511 |
-
|
| 1512 |
#endif
|
| 1513 |
}
|
| 1514 |
|
|
@@ -1899,7 +1899,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t *
|
|
| 1899 |
|
| 1900 |
//========================- 2-bit (de)-quantization
|
| 1901 |
|
| 1902 |
-
void
|
| 1903 |
assert(k % QK_K == 0);
|
| 1904 |
const int nb = k / QK_K;
|
| 1905 |
|
|
@@ -2002,7 +2002,7 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int6
|
|
| 2002 |
}
|
| 2003 |
|
| 2004 |
void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 2005 |
-
|
| 2006 |
}
|
| 2007 |
|
| 2008 |
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
|
@@ -2226,7 +2226,7 @@ static void quantize_row_q2_K_impl(const float * restrict x, block_q2_K * restri
|
|
| 2226 |
size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 2227 |
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
| 2228 |
if (!quant_weights) {
|
| 2229 |
-
|
| 2230 |
}
|
| 2231 |
else {
|
| 2232 |
char * qrow = (char *)dst;
|
|
@@ -2241,7 +2241,7 @@ size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
| 2241 |
|
| 2242 |
//========================= 3-bit (de)-quantization
|
| 2243 |
|
| 2244 |
-
void
|
| 2245 |
assert(k % QK_K == 0);
|
| 2246 |
const int nb = k / QK_K;
|
| 2247 |
|
|
@@ -2368,7 +2368,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int6
|
|
| 2368 |
}
|
| 2369 |
|
| 2370 |
void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 2371 |
-
|
| 2372 |
}
|
| 2373 |
|
| 2374 |
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
|
|
@@ -2458,7 +2458,7 @@ static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restri
|
|
| 2458 |
size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 2459 |
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
| 2460 |
if (!quant_weights) {
|
| 2461 |
-
|
| 2462 |
}
|
| 2463 |
else {
|
| 2464 |
char * qrow = (char *)dst;
|
|
@@ -2473,7 +2473,7 @@ size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
| 2473 |
|
| 2474 |
// ====================== 4-bit (de)-quantization
|
| 2475 |
|
| 2476 |
-
void
|
| 2477 |
assert(k % QK_K == 0);
|
| 2478 |
const int nb = k / QK_K;
|
| 2479 |
|
|
@@ -2572,7 +2572,7 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int6
|
|
| 2572 |
void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 2573 |
assert(k % QK_K == 0);
|
| 2574 |
block_q4_K * restrict y = vy;
|
| 2575 |
-
|
| 2576 |
}
|
| 2577 |
|
| 2578 |
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
|
@@ -2651,7 +2651,7 @@ static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restri
|
|
| 2651 |
size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 2652 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
| 2653 |
if (!quant_weights) {
|
| 2654 |
-
|
| 2655 |
}
|
| 2656 |
else {
|
| 2657 |
char * qrow = (char *)dst;
|
|
@@ -2666,7 +2666,7 @@ size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
| 2666 |
|
| 2667 |
// ====================== 5-bit (de)-quantization
|
| 2668 |
|
| 2669 |
-
void
|
| 2670 |
assert(k % QK_K == 0);
|
| 2671 |
const int64_t nb = k / QK_K;
|
| 2672 |
|
|
@@ -2783,7 +2783,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int6
|
|
| 2783 |
void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 2784 |
assert(k % QK_K == 0);
|
| 2785 |
block_q5_K * restrict y = vy;
|
| 2786 |
-
|
| 2787 |
}
|
| 2788 |
|
| 2789 |
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
|
@@ -2882,7 +2882,7 @@ static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restri
|
|
| 2882 |
size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 2883 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
| 2884 |
if (!quant_weights) {
|
| 2885 |
-
|
| 2886 |
}
|
| 2887 |
else {
|
| 2888 |
char * qrow = (char *)dst;
|
|
@@ -2897,7 +2897,7 @@ size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nr
|
|
| 2897 |
|
| 2898 |
// ====================== 6-bit (de)-quantization
|
| 2899 |
|
| 2900 |
-
void
|
| 2901 |
assert(k % QK_K == 0);
|
| 2902 |
const int64_t nb = k / QK_K;
|
| 2903 |
|
|
@@ -3001,7 +3001,7 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int6
|
|
| 3001 |
void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 3002 |
assert(k % QK_K == 0);
|
| 3003 |
block_q6_K * restrict y = vy;
|
| 3004 |
-
|
| 3005 |
}
|
| 3006 |
|
| 3007 |
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
|
@@ -3091,7 +3091,7 @@ static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restri
|
|
| 3091 |
size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3092 |
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
| 3093 |
if (!quant_weights) {
|
| 3094 |
-
|
| 3095 |
}
|
| 3096 |
else {
|
| 3097 |
char * qrow = (char *)dst;
|
|
@@ -3108,7 +3108,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
|
| 3108 |
static_assert(QK4_0 == 32, "QK4_0 must be 32");
|
| 3109 |
|
| 3110 |
if (!quant_weights) {
|
| 3111 |
-
|
| 3112 |
return;
|
| 3113 |
}
|
| 3114 |
|
|
@@ -3134,7 +3134,7 @@ static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restri
|
|
| 3134 |
|
| 3135 |
size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3136 |
if (!quant_weights) {
|
| 3137 |
-
|
| 3138 |
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
| 3139 |
}
|
| 3140 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
|
@@ -3151,7 +3151,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
|
| 3151 |
static_assert(QK4_1 == 32, "QK4_1 must be 32");
|
| 3152 |
|
| 3153 |
if (!quant_weights) {
|
| 3154 |
-
|
| 3155 |
return;
|
| 3156 |
}
|
| 3157 |
|
|
@@ -3179,7 +3179,7 @@ static void quantize_row_q4_1_impl(const float * restrict x, block_q4_1 * restri
|
|
| 3179 |
|
| 3180 |
size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3181 |
if (!quant_weights) {
|
| 3182 |
-
|
| 3183 |
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
| 3184 |
}
|
| 3185 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
|
@@ -3196,7 +3196,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
|
| 3196 |
static_assert(QK5_0 == 32, "QK5_0 must be 32");
|
| 3197 |
|
| 3198 |
if (!quant_weights) {
|
| 3199 |
-
|
| 3200 |
return;
|
| 3201 |
}
|
| 3202 |
|
|
@@ -3233,7 +3233,7 @@ static void quantize_row_q5_0_impl(const float * restrict x, block_q5_0 * restri
|
|
| 3233 |
|
| 3234 |
size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3235 |
if (!quant_weights) {
|
| 3236 |
-
|
| 3237 |
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
| 3238 |
}
|
| 3239 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
|
@@ -3250,7 +3250,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
|
| 3250 |
static_assert(QK5_1 == 32, "QK5_1 must be 32");
|
| 3251 |
|
| 3252 |
if (!quant_weights) {
|
| 3253 |
-
|
| 3254 |
return;
|
| 3255 |
}
|
| 3256 |
|
|
@@ -3286,7 +3286,7 @@ static void quantize_row_q5_1_impl(const float * restrict x, block_q5_1 * restri
|
|
| 3286 |
|
| 3287 |
size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3288 |
if (!quant_weights) {
|
| 3289 |
-
|
| 3290 |
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
| 3291 |
}
|
| 3292 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
|
@@ -3302,7 +3302,7 @@ size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nr
|
|
| 3302 |
size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3303 |
(void)quant_weights; // not used
|
| 3304 |
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
| 3305 |
-
|
| 3306 |
return nrow * row_size;
|
| 3307 |
}
|
| 3308 |
|
|
@@ -3590,7 +3590,7 @@ void dequantize_row_iq4_xs(const block_iq4_xs * restrict x, float * restrict y,
|
|
| 3590 |
|
| 3591 |
//===================================== Q8_K ==============================================
|
| 3592 |
|
| 3593 |
-
void
|
| 3594 |
assert(k % QK_K == 0);
|
| 3595 |
const int64_t nb = k / QK_K;
|
| 3596 |
|
|
@@ -3641,7 +3641,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int6
|
|
| 3641 |
}
|
| 3642 |
|
| 3643 |
void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
|
| 3644 |
-
|
| 3645 |
}
|
| 3646 |
|
| 3647 |
//===================================== Dot ptoducts =================================
|
|
@@ -13542,10 +13542,10 @@ size_t quantize_iq3_xxs(const float * restrict src, void * restrict dst, int64_t
|
|
| 13542 |
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
|
| 13543 |
assert(k % QK_K == 0);
|
| 13544 |
block_iq3_xxs * restrict y = vy;
|
| 13545 |
-
|
| 13546 |
}
|
| 13547 |
|
| 13548 |
-
void
|
| 13549 |
assert(k % QK_K == 0);
|
| 13550 |
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
| 13551 |
}
|
|
@@ -13758,10 +13758,10 @@ size_t quantize_iq3_s(const float * restrict src, void * restrict dst, int64_t n
|
|
| 13758 |
void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
|
| 13759 |
assert(k % QK_K == 0);
|
| 13760 |
block_iq3_s * restrict y = vy;
|
| 13761 |
-
|
| 13762 |
}
|
| 13763 |
|
| 13764 |
-
void
|
| 13765 |
assert(k % QK_K == 0);
|
| 13766 |
quantize_iq3_s(x, y, 1, k, NULL);
|
| 13767 |
}
|
|
@@ -14499,7 +14499,7 @@ void quantize_row_iq4_nl(const float * restrict x, void * restrict vy, int64_t k
|
|
| 14499 |
}
|
| 14500 |
}
|
| 14501 |
|
| 14502 |
-
void
|
| 14503 |
assert(k % QK4_NL == 0);
|
| 14504 |
quantize_row_iq4_nl(x, y, k);
|
| 14505 |
}
|
|
@@ -14527,10 +14527,10 @@ size_t quantize_iq4_xs(const float * restrict src, void * restrict dst, int64_t
|
|
| 14527 |
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
|
| 14528 |
assert(k % QK_K == 0);
|
| 14529 |
block_iq4_xs * restrict y = vy;
|
| 14530 |
-
|
| 14531 |
}
|
| 14532 |
|
| 14533 |
-
void
|
| 14534 |
assert(k % QK_K == 0);
|
| 14535 |
quantize_iq4_xs(x, y, 1, k, NULL);
|
| 14536 |
}
|
|
@@ -14717,7 +14717,7 @@ size_t quantize_iq2_s(const float * restrict src, void * restrict dst, int64_t n
|
|
| 14717 |
return nrow * nblock * sizeof(block_iq2_s);
|
| 14718 |
}
|
| 14719 |
|
| 14720 |
-
void
|
| 14721 |
assert(k % QK_K == 0);
|
| 14722 |
quantize_iq2_s(x, y, 1, k, NULL);
|
| 14723 |
}
|
|
@@ -14725,7 +14725,7 @@ void quantize_row_iq2_s_reference(const float * restrict x, block_iq2_s * restri
|
|
| 14725 |
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
|
| 14726 |
assert(k % QK_K == 0);
|
| 14727 |
block_iq2_s * restrict y = vy;
|
| 14728 |
-
|
| 14729 |
}
|
| 14730 |
|
| 14731 |
static bool validate_float(float f, size_t i) {
|
|
|
|
| 658 |
#endif //__loongarch_asx
|
| 659 |
|
| 660 |
// reference implementation for deterministic creation of model files
|
| 661 |
+
void quantize_row_q4_0_ref(const float * restrict x, block_q4_0 * restrict y, int64_t k) {
|
| 662 |
static const int qk = QK4_0;
|
| 663 |
|
| 664 |
assert(k % qk == 0);
|
|
|
|
| 696 |
}
|
| 697 |
|
| 698 |
void quantize_row_q4_0(const float * restrict x, void * restrict y, int64_t k) {
|
| 699 |
+
quantize_row_q4_0_ref(x, y, k);
|
| 700 |
}
|
| 701 |
|
| 702 |
|
| 703 |
+
void quantize_row_q4_1_ref(const float * restrict x, block_q4_1 * restrict y, int64_t k) {
|
| 704 |
const int qk = QK4_1;
|
| 705 |
|
| 706 |
assert(k % qk == 0);
|
|
|
|
| 738 |
}
|
| 739 |
|
| 740 |
void quantize_row_q4_1(const float * restrict x, void * restrict y, int64_t k) {
|
| 741 |
+
quantize_row_q4_1_ref(x, y, k);
|
| 742 |
}
|
| 743 |
|
| 744 |
+
void quantize_row_q5_0_ref(const float * restrict x, block_q5_0 * restrict y, int64_t k) {
|
| 745 |
static const int qk = QK5_0;
|
| 746 |
|
| 747 |
assert(k % qk == 0);
|
|
|
|
| 786 |
}
|
| 787 |
|
| 788 |
void quantize_row_q5_0(const float * restrict x, void * restrict y, int64_t k) {
|
| 789 |
+
quantize_row_q5_0_ref(x, y, k);
|
| 790 |
}
|
| 791 |
|
| 792 |
+
void quantize_row_q5_1_ref(const float * restrict x, block_q5_1 * restrict y, int64_t k) {
|
| 793 |
const int qk = QK5_1;
|
| 794 |
|
| 795 |
assert(k % qk == 0);
|
|
|
|
| 834 |
}
|
| 835 |
|
| 836 |
void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
|
| 837 |
+
quantize_row_q5_1_ref(x, y, k);
|
| 838 |
}
|
| 839 |
|
| 840 |
// reference implementation for deterministic creation of model files
|
| 841 |
+
void quantize_row_q8_0_ref(const float * restrict x, block_q8_0 * restrict y, int64_t k) {
|
| 842 |
assert(k % QK8_0 == 0);
|
| 843 |
const int nb = k / QK8_0;
|
| 844 |
|
|
|
|
| 1144 |
#else
|
| 1145 |
GGML_UNUSED(nb);
|
| 1146 |
// scalar
|
| 1147 |
+
quantize_row_q8_0_ref(x, y, k);
|
| 1148 |
#endif
|
| 1149 |
}
|
| 1150 |
|
| 1151 |
// reference implementation for deterministic creation of model files
|
| 1152 |
+
void quantize_row_q8_1_ref(const float * restrict x, block_q8_1 * restrict y, int64_t k) {
|
| 1153 |
assert(QK8_1 == 32);
|
| 1154 |
assert(k % QK8_1 == 0);
|
| 1155 |
const int nb = k / QK8_1;
|
|
|
|
| 1508 |
#else
|
| 1509 |
GGML_UNUSED(nb);
|
| 1510 |
// scalar
|
| 1511 |
+
quantize_row_q8_1_ref(x, y, k);
|
| 1512 |
#endif
|
| 1513 |
}
|
| 1514 |
|
|
|
|
| 1899 |
|
| 1900 |
//========================- 2-bit (de)-quantization
|
| 1901 |
|
| 1902 |
+
void quantize_row_q2_K_ref(const float * restrict x, block_q2_K * restrict y, int64_t k) {
|
| 1903 |
assert(k % QK_K == 0);
|
| 1904 |
const int nb = k / QK_K;
|
| 1905 |
|
|
|
|
| 2002 |
}
|
| 2003 |
|
| 2004 |
void quantize_row_q2_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 2005 |
+
quantize_row_q2_K_ref(x, vy, k);
|
| 2006 |
}
|
| 2007 |
|
| 2008 |
static float make_qkx3_quants(int n, int nmax, const float * restrict x, const float * restrict weights,
|
|
|
|
| 2226 |
size_t quantize_q2_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 2227 |
size_t row_size = ggml_row_size(GGML_TYPE_Q2_K, n_per_row);
|
| 2228 |
if (!quant_weights) {
|
| 2229 |
+
quantize_row_q2_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
| 2230 |
}
|
| 2231 |
else {
|
| 2232 |
char * qrow = (char *)dst;
|
|
|
|
| 2241 |
|
| 2242 |
//========================= 3-bit (de)-quantization
|
| 2243 |
|
| 2244 |
+
void quantize_row_q3_K_ref(const float * restrict x, block_q3_K * restrict y, int64_t k) {
|
| 2245 |
assert(k % QK_K == 0);
|
| 2246 |
const int nb = k / QK_K;
|
| 2247 |
|
|
|
|
| 2368 |
}
|
| 2369 |
|
| 2370 |
void quantize_row_q3_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 2371 |
+
quantize_row_q3_K_ref(x, vy, k);
|
| 2372 |
}
|
| 2373 |
|
| 2374 |
static void quantize_row_q3_K_impl(const float * restrict x, block_q3_K * restrict y, int64_t n_per_row, const float * restrict quant_weights) {
|
|
|
|
| 2458 |
size_t quantize_q3_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 2459 |
size_t row_size = ggml_row_size(GGML_TYPE_Q3_K, n_per_row);
|
| 2460 |
if (!quant_weights) {
|
| 2461 |
+
quantize_row_q3_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
| 2462 |
}
|
| 2463 |
else {
|
| 2464 |
char * qrow = (char *)dst;
|
|
|
|
| 2473 |
|
| 2474 |
// ====================== 4-bit (de)-quantization
|
| 2475 |
|
| 2476 |
+
void quantize_row_q4_K_ref(const float * restrict x, block_q4_K * restrict y, int64_t k) {
|
| 2477 |
assert(k % QK_K == 0);
|
| 2478 |
const int nb = k / QK_K;
|
| 2479 |
|
|
|
|
| 2572 |
void quantize_row_q4_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 2573 |
assert(k % QK_K == 0);
|
| 2574 |
block_q4_K * restrict y = vy;
|
| 2575 |
+
quantize_row_q4_K_ref(x, y, k);
|
| 2576 |
}
|
| 2577 |
|
| 2578 |
static void quantize_row_q4_K_impl(const float * restrict x, block_q4_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
| 2651 |
size_t quantize_q4_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 2652 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_K, n_per_row);
|
| 2653 |
if (!quant_weights) {
|
| 2654 |
+
quantize_row_q4_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
| 2655 |
}
|
| 2656 |
else {
|
| 2657 |
char * qrow = (char *)dst;
|
|
|
|
| 2666 |
|
| 2667 |
// ====================== 5-bit (de)-quantization
|
| 2668 |
|
| 2669 |
+
void quantize_row_q5_K_ref(const float * restrict x, block_q5_K * restrict y, int64_t k) {
|
| 2670 |
assert(k % QK_K == 0);
|
| 2671 |
const int64_t nb = k / QK_K;
|
| 2672 |
|
|
|
|
| 2783 |
void quantize_row_q5_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 2784 |
assert(k % QK_K == 0);
|
| 2785 |
block_q5_K * restrict y = vy;
|
| 2786 |
+
quantize_row_q5_K_ref(x, y, k);
|
| 2787 |
}
|
| 2788 |
|
| 2789 |
static void quantize_row_q5_K_impl(const float * restrict x, block_q5_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
| 2882 |
size_t quantize_q5_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 2883 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_K, n_per_row);
|
| 2884 |
if (!quant_weights) {
|
| 2885 |
+
quantize_row_q5_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
| 2886 |
}
|
| 2887 |
else {
|
| 2888 |
char * qrow = (char *)dst;
|
|
|
|
| 2897 |
|
| 2898 |
// ====================== 6-bit (de)-quantization
|
| 2899 |
|
| 2900 |
+
void quantize_row_q6_K_ref(const float * restrict x, block_q6_K * restrict y, int64_t k) {
|
| 2901 |
assert(k % QK_K == 0);
|
| 2902 |
const int64_t nb = k / QK_K;
|
| 2903 |
|
|
|
|
| 3001 |
void quantize_row_q6_K(const float * restrict x, void * restrict vy, int64_t k) {
|
| 3002 |
assert(k % QK_K == 0);
|
| 3003 |
block_q6_K * restrict y = vy;
|
| 3004 |
+
quantize_row_q6_K_ref(x, y, k);
|
| 3005 |
}
|
| 3006 |
|
| 3007 |
static void quantize_row_q6_K_impl(const float * restrict x, block_q6_K * restrict y, int64_t n_per_row, const float * quant_weights) {
|
|
|
|
| 3091 |
size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3092 |
size_t row_size = ggml_row_size(GGML_TYPE_Q6_K, n_per_row);
|
| 3093 |
if (!quant_weights) {
|
| 3094 |
+
quantize_row_q6_K_ref(src, dst, (int64_t)nrow*n_per_row);
|
| 3095 |
}
|
| 3096 |
else {
|
| 3097 |
char * qrow = (char *)dst;
|
|
|
|
| 3108 |
static_assert(QK4_0 == 32, "QK4_0 must be 32");
|
| 3109 |
|
| 3110 |
if (!quant_weights) {
|
| 3111 |
+
quantize_row_q4_0_ref(x, y, n_per_row);
|
| 3112 |
return;
|
| 3113 |
}
|
| 3114 |
|
|
|
|
| 3134 |
|
| 3135 |
size_t quantize_q4_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3136 |
if (!quant_weights) {
|
| 3137 |
+
quantize_row_q4_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
| 3138 |
return nrow * ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
| 3139 |
}
|
| 3140 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_0, n_per_row);
|
|
|
|
| 3151 |
static_assert(QK4_1 == 32, "QK4_1 must be 32");
|
| 3152 |
|
| 3153 |
if (!quant_weights) {
|
| 3154 |
+
quantize_row_q4_1_ref(x, y, n_per_row);
|
| 3155 |
return;
|
| 3156 |
}
|
| 3157 |
|
|
|
|
| 3179 |
|
| 3180 |
size_t quantize_q4_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3181 |
if (!quant_weights) {
|
| 3182 |
+
quantize_row_q4_1_ref(src, dst, (int64_t)nrow*n_per_row);
|
| 3183 |
return nrow * ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
| 3184 |
}
|
| 3185 |
size_t row_size = ggml_row_size(GGML_TYPE_Q4_1, n_per_row);
|
|
|
|
| 3196 |
static_assert(QK5_0 == 32, "QK5_0 must be 32");
|
| 3197 |
|
| 3198 |
if (!quant_weights) {
|
| 3199 |
+
quantize_row_q5_0_ref(x, y, n_per_row);
|
| 3200 |
return;
|
| 3201 |
}
|
| 3202 |
|
|
|
|
| 3233 |
|
| 3234 |
size_t quantize_q5_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3235 |
if (!quant_weights) {
|
| 3236 |
+
quantize_row_q5_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
| 3237 |
return nrow * ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
| 3238 |
}
|
| 3239 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_0, n_per_row);
|
|
|
|
| 3250 |
static_assert(QK5_1 == 32, "QK5_1 must be 32");
|
| 3251 |
|
| 3252 |
if (!quant_weights) {
|
| 3253 |
+
quantize_row_q5_1_ref(x, y, n_per_row);
|
| 3254 |
return;
|
| 3255 |
}
|
| 3256 |
|
|
|
|
| 3286 |
|
| 3287 |
size_t quantize_q5_1(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3288 |
if (!quant_weights) {
|
| 3289 |
+
quantize_row_q5_1_ref(src, dst, (int64_t)nrow*n_per_row);
|
| 3290 |
return nrow * ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
| 3291 |
}
|
| 3292 |
size_t row_size = ggml_row_size(GGML_TYPE_Q5_1, n_per_row);
|
|
|
|
| 3302 |
size_t quantize_q8_0(const float * restrict src, void * restrict dst, int64_t nrow, int64_t n_per_row, const float * quant_weights) {
|
| 3303 |
(void)quant_weights; // not used
|
| 3304 |
const size_t row_size = ggml_row_size(GGML_TYPE_Q8_0, n_per_row);
|
| 3305 |
+
quantize_row_q8_0_ref(src, dst, (int64_t)nrow*n_per_row);
|
| 3306 |
return nrow * row_size;
|
| 3307 |
}
|
| 3308 |
|
|
|
|
| 3590 |
|
| 3591 |
//===================================== Q8_K ==============================================
|
| 3592 |
|
| 3593 |
+
void quantize_row_q8_K_ref(const float * restrict x, block_q8_K * restrict y, int64_t k) {
|
| 3594 |
assert(k % QK_K == 0);
|
| 3595 |
const int64_t nb = k / QK_K;
|
| 3596 |
|
|
|
|
| 3641 |
}
|
| 3642 |
|
| 3643 |
void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
|
| 3644 |
+
quantize_row_q8_K_ref(x, y, k);
|
| 3645 |
}
|
| 3646 |
|
| 3647 |
//===================================== Dot ptoducts =================================
|
|
|
|
| 13542 |
void quantize_row_iq3_xxs(const float * restrict x, void * restrict vy, int64_t k) {
|
| 13543 |
assert(k % QK_K == 0);
|
| 13544 |
block_iq3_xxs * restrict y = vy;
|
| 13545 |
+
quantize_row_iq3_xxs_ref(x, y, k);
|
| 13546 |
}
|
| 13547 |
|
| 13548 |
+
void quantize_row_iq3_xxs_ref(const float * restrict x, block_iq3_xxs * restrict y, int64_t k) {
|
| 13549 |
assert(k % QK_K == 0);
|
| 13550 |
quantize_row_iq3_xxs_impl(256, x, y, k, NULL);
|
| 13551 |
}
|
|
|
|
| 13758 |
void quantize_row_iq3_s(const float * restrict x, void * restrict vy, int64_t k) {
|
| 13759 |
assert(k % QK_K == 0);
|
| 13760 |
block_iq3_s * restrict y = vy;
|
| 13761 |
+
quantize_row_iq3_s_ref(x, y, k);
|
| 13762 |
}
|
| 13763 |
|
| 13764 |
+
void quantize_row_iq3_s_ref(const float * restrict x, block_iq3_s * restrict y, int64_t k) {
|
| 13765 |
assert(k % QK_K == 0);
|
| 13766 |
quantize_iq3_s(x, y, 1, k, NULL);
|
| 13767 |
}
|
|
|
|
| 14499 |
}
|
| 14500 |
}
|
| 14501 |
|
| 14502 |
+
void quantize_row_iq4_nl_ref(const float * restrict x, block_iq4_nl * restrict y, int64_t k) {
|
| 14503 |
assert(k % QK4_NL == 0);
|
| 14504 |
quantize_row_iq4_nl(x, y, k);
|
| 14505 |
}
|
|
|
|
| 14527 |
void quantize_row_iq4_xs(const float * restrict x, void * restrict vy, int64_t k) {
|
| 14528 |
assert(k % QK_K == 0);
|
| 14529 |
block_iq4_xs * restrict y = vy;
|
| 14530 |
+
quantize_row_iq4_xs_ref(x, y, k);
|
| 14531 |
}
|
| 14532 |
|
| 14533 |
+
void quantize_row_iq4_xs_ref(const float * restrict x, block_iq4_xs * restrict y, int64_t k) {
|
| 14534 |
assert(k % QK_K == 0);
|
| 14535 |
quantize_iq4_xs(x, y, 1, k, NULL);
|
| 14536 |
}
|
|
|
|
| 14717 |
return nrow * nblock * sizeof(block_iq2_s);
|
| 14718 |
}
|
| 14719 |
|
| 14720 |
+
void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y, int64_t k) {
|
| 14721 |
assert(k % QK_K == 0);
|
| 14722 |
quantize_iq2_s(x, y, 1, k, NULL);
|
| 14723 |
}
|
|
|
|
| 14725 |
void quantize_row_iq2_s(const float * restrict x, void * restrict vy, int64_t k) {
|
| 14726 |
assert(k % QK_K == 0);
|
| 14727 |
block_iq2_s * restrict y = vy;
|
| 14728 |
+
quantize_row_iq2_s_ref(x, y, k);
|
| 14729 |
}
|
| 14730 |
|
| 14731 |
static bool validate_float(float f, size_t i) {
|
ggml/src/ggml-quants.h
CHANGED
|
@@ -12,25 +12,25 @@ extern "C" {
|
|
| 12 |
#endif
|
| 13 |
|
| 14 |
// Quantization
|
| 15 |
-
void
|
| 16 |
-
void
|
| 17 |
-
void
|
| 18 |
-
void
|
| 19 |
-
void
|
| 20 |
-
void
|
| 21 |
-
|
| 22 |
-
void
|
| 23 |
-
void
|
| 24 |
-
void
|
| 25 |
-
void
|
| 26 |
-
void
|
| 27 |
-
void
|
| 28 |
-
|
| 29 |
-
void
|
| 30 |
-
void
|
| 31 |
-
void
|
| 32 |
-
void
|
| 33 |
-
void
|
| 34 |
|
| 35 |
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 36 |
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
|
|
|
| 12 |
#endif
|
| 13 |
|
| 14 |
// Quantization
|
| 15 |
+
void quantize_row_q4_0_ref(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
|
| 16 |
+
void quantize_row_q4_1_ref(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
|
| 17 |
+
void quantize_row_q5_0_ref(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
|
| 18 |
+
void quantize_row_q5_1_ref(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
|
| 19 |
+
void quantize_row_q8_0_ref(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
|
| 20 |
+
void quantize_row_q8_1_ref(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
|
| 21 |
+
|
| 22 |
+
void quantize_row_q2_K_ref(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
|
| 23 |
+
void quantize_row_q3_K_ref(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
|
| 24 |
+
void quantize_row_q4_K_ref(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
|
| 25 |
+
void quantize_row_q5_K_ref(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
|
| 26 |
+
void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
|
| 27 |
+
void quantize_row_q8_K_ref(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
|
| 28 |
+
|
| 29 |
+
void quantize_row_iq3_xxs_ref(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
|
| 30 |
+
void quantize_row_iq4_nl_ref (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
|
| 31 |
+
void quantize_row_iq4_xs_ref (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
|
| 32 |
+
void quantize_row_iq3_s_ref (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
|
| 33 |
+
void quantize_row_iq2_s_ref (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
|
| 34 |
|
| 35 |
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
| 36 |
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
ggml/src/ggml.c
CHANGED
|
@@ -592,7 +592,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 592 |
.is_quantized = false,
|
| 593 |
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
|
| 594 |
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
| 595 |
-
.
|
| 596 |
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
| 597 |
.vec_dot_type = GGML_TYPE_F16,
|
| 598 |
.nrows = 1,
|
|
@@ -604,7 +604,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 604 |
.is_quantized = true,
|
| 605 |
.to_float = (ggml_to_float_t) dequantize_row_q4_0,
|
| 606 |
.from_float = quantize_row_q4_0,
|
| 607 |
-
.
|
| 608 |
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
| 609 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 610 |
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
|
@@ -620,7 +620,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 620 |
.is_quantized = true,
|
| 621 |
.to_float = (ggml_to_float_t) dequantize_row_q4_1,
|
| 622 |
.from_float = quantize_row_q4_1,
|
| 623 |
-
.
|
| 624 |
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
| 625 |
.vec_dot_type = GGML_TYPE_Q8_1,
|
| 626 |
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
|
@@ -636,7 +636,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 636 |
.is_quantized = false,
|
| 637 |
.to_float = NULL,
|
| 638 |
.from_float = NULL,
|
| 639 |
-
.
|
| 640 |
.vec_dot = NULL,
|
| 641 |
.vec_dot_type = GGML_TYPE_COUNT,
|
| 642 |
.nrows = 1,
|
|
@@ -648,7 +648,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 648 |
.is_quantized = false,
|
| 649 |
.to_float = NULL,
|
| 650 |
.from_float = NULL,
|
| 651 |
-
.
|
| 652 |
.vec_dot = NULL,
|
| 653 |
.vec_dot_type = GGML_TYPE_COUNT,
|
| 654 |
.nrows = 1,
|
|
@@ -660,7 +660,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 660 |
.is_quantized = true,
|
| 661 |
.to_float = (ggml_to_float_t) dequantize_row_q5_0,
|
| 662 |
.from_float = quantize_row_q5_0,
|
| 663 |
-
.
|
| 664 |
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
| 665 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 666 |
.nrows = 1,
|
|
@@ -672,7 +672,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 672 |
.is_quantized = true,
|
| 673 |
.to_float = (ggml_to_float_t) dequantize_row_q5_1,
|
| 674 |
.from_float = quantize_row_q5_1,
|
| 675 |
-
.
|
| 676 |
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
| 677 |
.vec_dot_type = GGML_TYPE_Q8_1,
|
| 678 |
.nrows = 1,
|
|
@@ -684,7 +684,8 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 684 |
.is_quantized = true,
|
| 685 |
.to_float = (ggml_to_float_t) dequantize_row_q8_0,
|
| 686 |
.from_float = quantize_row_q8_0,
|
| 687 |
-
.
|
|
|
|
| 688 |
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
| 689 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 690 |
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
|
@@ -692,7 +693,6 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 692 |
#else
|
| 693 |
.nrows = 1,
|
| 694 |
#endif
|
| 695 |
-
.from_float_to_mat = quantize_mat_q8_0,
|
| 696 |
},
|
| 697 |
[GGML_TYPE_Q8_1] = {
|
| 698 |
.type_name = "q8_1",
|
|
@@ -700,7 +700,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 700 |
.type_size = sizeof(block_q8_1),
|
| 701 |
.is_quantized = true,
|
| 702 |
.from_float = quantize_row_q8_1,
|
| 703 |
-
.
|
| 704 |
.vec_dot_type = GGML_TYPE_Q8_1,
|
| 705 |
.nrows = 1,
|
| 706 |
},
|
|
@@ -711,7 +711,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 711 |
.is_quantized = true,
|
| 712 |
.to_float = (ggml_to_float_t) dequantize_row_q2_K,
|
| 713 |
.from_float = quantize_row_q2_K,
|
| 714 |
-
.
|
| 715 |
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
| 716 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 717 |
.nrows = 1,
|
|
@@ -723,7 +723,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 723 |
.is_quantized = true,
|
| 724 |
.to_float = (ggml_to_float_t) dequantize_row_q3_K,
|
| 725 |
.from_float = quantize_row_q3_K,
|
| 726 |
-
.
|
| 727 |
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
| 728 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 729 |
.nrows = 1,
|
|
@@ -735,7 +735,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 735 |
.is_quantized = true,
|
| 736 |
.to_float = (ggml_to_float_t) dequantize_row_q4_K,
|
| 737 |
.from_float = quantize_row_q4_K,
|
| 738 |
-
.
|
| 739 |
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
| 740 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 741 |
.nrows = 1,
|
|
@@ -747,7 +747,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 747 |
.is_quantized = true,
|
| 748 |
.to_float = (ggml_to_float_t) dequantize_row_q5_K,
|
| 749 |
.from_float = quantize_row_q5_K,
|
| 750 |
-
.
|
| 751 |
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
| 752 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 753 |
.nrows = 1,
|
|
@@ -759,7 +759,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 759 |
.is_quantized = true,
|
| 760 |
.to_float = (ggml_to_float_t) dequantize_row_q6_K,
|
| 761 |
.from_float = quantize_row_q6_K,
|
| 762 |
-
.
|
| 763 |
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
| 764 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 765 |
.nrows = 1,
|
|
@@ -771,7 +771,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 771 |
.is_quantized = true,
|
| 772 |
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
|
| 773 |
.from_float = NULL,
|
| 774 |
-
.
|
| 775 |
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
| 776 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 777 |
.nrows = 1,
|
|
@@ -783,7 +783,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 783 |
.is_quantized = true,
|
| 784 |
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
|
| 785 |
.from_float = NULL,
|
| 786 |
-
.
|
| 787 |
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
| 788 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 789 |
.nrows = 1,
|
|
@@ -795,7 +795,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 795 |
.is_quantized = true,
|
| 796 |
.to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
|
| 797 |
.from_float = quantize_row_iq3_xxs,
|
| 798 |
-
.
|
| 799 |
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
| 800 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 801 |
.nrows = 1,
|
|
@@ -807,7 +807,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 807 |
.is_quantized = true,
|
| 808 |
.to_float = (ggml_to_float_t) dequantize_row_iq3_s,
|
| 809 |
.from_float = quantize_row_iq3_s,
|
| 810 |
-
.
|
| 811 |
.vec_dot = ggml_vec_dot_iq3_s_q8_K,
|
| 812 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 813 |
.nrows = 1,
|
|
@@ -819,7 +819,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 819 |
.is_quantized = true,
|
| 820 |
.to_float = (ggml_to_float_t) dequantize_row_iq2_s,
|
| 821 |
.from_float = quantize_row_iq2_s,
|
| 822 |
-
.
|
| 823 |
.vec_dot = ggml_vec_dot_iq2_s_q8_K,
|
| 824 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 825 |
.nrows = 1,
|
|
@@ -831,7 +831,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 831 |
.is_quantized = true,
|
| 832 |
.to_float = (ggml_to_float_t) dequantize_row_iq1_s,
|
| 833 |
.from_float = NULL,
|
| 834 |
-
.
|
| 835 |
.vec_dot = ggml_vec_dot_iq1_s_q8_K,
|
| 836 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 837 |
.nrows = 1,
|
|
@@ -843,7 +843,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 843 |
.is_quantized = true,
|
| 844 |
.to_float = (ggml_to_float_t) dequantize_row_iq1_m,
|
| 845 |
.from_float = NULL,
|
| 846 |
-
.
|
| 847 |
.vec_dot = ggml_vec_dot_iq1_m_q8_K,
|
| 848 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 849 |
.nrows = 1,
|
|
@@ -855,7 +855,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 855 |
.is_quantized = true,
|
| 856 |
.to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
|
| 857 |
.from_float = quantize_row_iq4_nl,
|
| 858 |
-
.
|
| 859 |
.vec_dot = ggml_vec_dot_iq4_nl_q8_0,
|
| 860 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 861 |
.nrows = 1,
|
|
@@ -867,7 +867,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 867 |
.is_quantized = true,
|
| 868 |
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
|
| 869 |
.from_float = quantize_row_iq4_xs,
|
| 870 |
-
.
|
| 871 |
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
| 872 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 873 |
.nrows = 1,
|
|
@@ -886,7 +886,7 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 886 |
.is_quantized = false,
|
| 887 |
.to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
|
| 888 |
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
| 889 |
-
.
|
| 890 |
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
|
| 891 |
.vec_dot_type = GGML_TYPE_BF16,
|
| 892 |
.nrows = 1,
|
|
@@ -894,48 +894,48 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = {
|
|
| 894 |
[GGML_TYPE_Q4_0_4_4] = {
|
| 895 |
.type_name = "q4_0_4x4",
|
| 896 |
.blck_size = QK4_0,
|
|
|
|
| 897 |
.type_size = sizeof(block_q4_0),
|
| 898 |
.is_quantized = true,
|
| 899 |
.to_float = NULL,
|
| 900 |
.from_float = NULL,
|
| 901 |
-
.
|
| 902 |
.vec_dot = NULL,
|
| 903 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 904 |
.nrows = 1,
|
| 905 |
.ncols = 4,
|
| 906 |
-
.interleave_blcksize = 4,
|
| 907 |
.gemv = ggml_gemv_q4_0_4x4_q8_0,
|
| 908 |
.gemm = ggml_gemm_q4_0_4x4_q8_0,
|
| 909 |
},
|
| 910 |
[GGML_TYPE_Q4_0_4_8] = {
|
| 911 |
.type_name = "q4_0_4x8",
|
| 912 |
.blck_size = QK4_0,
|
|
|
|
| 913 |
.type_size = sizeof(block_q4_0),
|
| 914 |
.is_quantized = true,
|
| 915 |
.to_float = NULL,
|
| 916 |
.from_float = NULL,
|
| 917 |
-
.
|
| 918 |
.vec_dot = NULL,
|
| 919 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 920 |
.nrows = 1,
|
| 921 |
.ncols = 4,
|
| 922 |
-
.interleave_blcksize = 8,
|
| 923 |
.gemv = ggml_gemv_q4_0_4x8_q8_0,
|
| 924 |
.gemm = ggml_gemm_q4_0_4x8_q8_0,
|
| 925 |
},
|
| 926 |
[GGML_TYPE_Q4_0_8_8] = {
|
| 927 |
.type_name = "q4_0_8x8",
|
| 928 |
.blck_size = QK4_0,
|
|
|
|
| 929 |
.type_size = sizeof(block_q4_0),
|
| 930 |
.is_quantized = true,
|
| 931 |
.to_float = NULL,
|
| 932 |
.from_float = NULL,
|
| 933 |
-
.
|
| 934 |
.vec_dot = NULL,
|
| 935 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 936 |
.nrows = 1,
|
| 937 |
.ncols = 8,
|
| 938 |
-
.interleave_blcksize = 8,
|
| 939 |
.gemv = ggml_gemv_q4_0_8x8_q8_0,
|
| 940 |
.gemm = ggml_gemm_q4_0_8x8_q8_0,
|
| 941 |
}
|
|
@@ -3115,7 +3115,7 @@ size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) {
|
|
| 3115 |
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
| 3116 |
}
|
| 3117 |
|
| 3118 |
-
GGML_CALL
|
| 3119 |
return type_traits[type].blck_size;
|
| 3120 |
}
|
| 3121 |
|
|
@@ -12192,15 +12192,14 @@ static void ggml_compute_forward_mul_mat(
|
|
| 12192 |
|
| 12193 |
const enum ggml_type type = src0->type;
|
| 12194 |
|
| 12195 |
-
enum ggml_type
|
| 12196 |
-
ggml_from_float_t
|
| 12197 |
-
|
| 12198 |
-
int64_t
|
| 12199 |
-
int64_t
|
| 12200 |
-
|
| 12201 |
-
|
| 12202 |
-
|
| 12203 |
-
ggml_gemm_t const gemm = type_traits[type].gemm;
|
| 12204 |
|
| 12205 |
GGML_ASSERT(ne0 == ne01);
|
| 12206 |
GGML_ASSERT(ne1 == ne11);
|
|
@@ -12264,14 +12263,14 @@ UseGgmlGemm1:;
|
|
| 12264 |
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
|
| 12265 |
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
| 12266 |
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
| 12267 |
-
4, ne10,
|
| 12268 |
}
|
| 12269 |
i11_processed = ne11 - ne11 % 4;
|
| 12270 |
}
|
| 12271 |
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
|
| 12272 |
-
|
| 12273 |
-
|
| 12274 |
-
|
| 12275 |
}
|
| 12276 |
}
|
| 12277 |
}
|
|
@@ -12355,7 +12354,7 @@ UseGgmlGemm2:;
|
|
| 12355 |
int64_t src0_start = (ith * ne01) / nth;
|
| 12356 |
int64_t src0_end = ((ith + 1) * ne01) / nth;
|
| 12357 |
src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
|
| 12358 |
-
src0_end = (src0_end
|
| 12359 |
if (src0_start >= src0_end) return;
|
| 12360 |
|
| 12361 |
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
|
|
@@ -12413,11 +12412,11 @@ static void ggml_compute_forward_mul_mat_id(
|
|
| 12413 |
|
| 12414 |
const bool src1_cont = ggml_is_contiguous(src1);
|
| 12415 |
|
| 12416 |
-
ggml_vec_dot_t const vec_dot
|
| 12417 |
-
enum ggml_type const vec_dot_type
|
| 12418 |
-
ggml_from_float_t const
|
| 12419 |
-
int64_t const matmul_num_cols
|
| 12420 |
-
ggml_gemv_t const gemv
|
| 12421 |
|
| 12422 |
// we don't support permuted src0 or src1
|
| 12423 |
GGML_ASSERT(nb00 == ggml_type_size(type));
|
|
@@ -12458,9 +12457,9 @@ static void ggml_compute_forward_mul_mat_id(
|
|
| 12458 |
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
| 12459 |
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
| 12460 |
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
|
| 12461 |
-
|
| 12462 |
-
|
| 12463 |
-
|
| 12464 |
}
|
| 12465 |
}
|
| 12466 |
}
|
|
@@ -21062,8 +21061,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
|
|
| 21062 |
(int64_t) info->ne[3];
|
| 21063 |
|
| 21064 |
if (ne % ggml_blck_size(info->type) != 0) {
|
| 21065 |
-
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%
|
| 21066 |
-
__func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
|
| 21067 |
fclose(file);
|
| 21068 |
gguf_free(ctx);
|
| 21069 |
return NULL;
|
|
|
|
| 592 |
.is_quantized = false,
|
| 593 |
.to_float = (ggml_to_float_t) ggml_fp16_to_fp32_row,
|
| 594 |
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
| 595 |
+
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
| 596 |
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
| 597 |
.vec_dot_type = GGML_TYPE_F16,
|
| 598 |
.nrows = 1,
|
|
|
|
| 604 |
.is_quantized = true,
|
| 605 |
.to_float = (ggml_to_float_t) dequantize_row_q4_0,
|
| 606 |
.from_float = quantize_row_q4_0,
|
| 607 |
+
.from_float_ref = (ggml_from_float_t) quantize_row_q4_0_ref,
|
| 608 |
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
| 609 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 610 |
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
|
|
|
| 620 |
.is_quantized = true,
|
| 621 |
.to_float = (ggml_to_float_t) dequantize_row_q4_1,
|
| 622 |
.from_float = quantize_row_q4_1,
|
| 623 |
+
.from_float_ref = (ggml_from_float_t) quantize_row_q4_1_ref,
|
| 624 |
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
| 625 |
.vec_dot_type = GGML_TYPE_Q8_1,
|
| 626 |
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
|
|
|
| 636 |
.is_quantized = false,
|
| 637 |
.to_float = NULL,
|
| 638 |
.from_float = NULL,
|
| 639 |
+
.from_float_ref = NULL,
|
| 640 |
.vec_dot = NULL,
|
| 641 |
.vec_dot_type = GGML_TYPE_COUNT,
|
| 642 |
.nrows = 1,
|
|
|
|
| 648 |
.is_quantized = false,
|
| 649 |
.to_float = NULL,
|
| 650 |
.from_float = NULL,
|
| 651 |
+
.from_float_ref = NULL,
|
| 652 |
.vec_dot = NULL,
|
| 653 |
.vec_dot_type = GGML_TYPE_COUNT,
|
| 654 |
.nrows = 1,
|
|
|
|
| 660 |
.is_quantized = true,
|
| 661 |
.to_float = (ggml_to_float_t) dequantize_row_q5_0,
|
| 662 |
.from_float = quantize_row_q5_0,
|
| 663 |
+
.from_float_ref = (ggml_from_float_t) quantize_row_q5_0_ref,
|
| 664 |
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
| 665 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 666 |
.nrows = 1,
|
|
|
|
| 672 |
.is_quantized = true,
|
| 673 |
.to_float = (ggml_to_float_t) dequantize_row_q5_1,
|
| 674 |
.from_float = quantize_row_q5_1,
|
| 675 |
+
.from_float_ref = (ggml_from_float_t) quantize_row_q5_1_ref,
|
| 676 |
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
| 677 |
.vec_dot_type = GGML_TYPE_Q8_1,
|
| 678 |
.nrows = 1,
|
|
|
|
| 684 |
.is_quantized = true,
|
| 685 |
.to_float = (ggml_to_float_t) dequantize_row_q8_0,
|
| 686 |
.from_float = quantize_row_q8_0,
|
| 687 |
+
.from_float_ref = (ggml_from_float_t) quantize_row_q8_0_ref,
|
| 688 |
+
.from_float_to_mat = quantize_mat_q8_0,
|
| 689 |
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
| 690 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 691 |
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
|
|
|
| 693 |
#else
|
| 694 |
.nrows = 1,
|
| 695 |
#endif
|
|
|
|
| 696 |
},
|
| 697 |
[GGML_TYPE_Q8_1] = {
|
| 698 |
.type_name = "q8_1",
|
|
|
|
| 700 |
.type_size = sizeof(block_q8_1),
|
| 701 |
.is_quantized = true,
|
| 702 |
.from_float = quantize_row_q8_1,
|
| 703 |
+
.from_float_ref = (ggml_from_float_t) quantize_row_q8_1_ref,
|
| 704 |
.vec_dot_type = GGML_TYPE_Q8_1,
|
| 705 |
.nrows = 1,
|
| 706 |
},
|
|
|
|
| 711 |
.is_quantized = true,
|
| 712 |
.to_float = (ggml_to_float_t) dequantize_row_q2_K,
|
| 713 |
.from_float = quantize_row_q2_K,
|
| 714 |
+
.from_float_ref = (ggml_from_float_t) quantize_row_q2_K_ref,
|
| 715 |
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
| 716 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 717 |
.nrows = 1,
|
|
|
|
| 723 |
.is_quantized = true,
|
| 724 |
.to_float = (ggml_to_float_t) dequantize_row_q3_K,
|
| 725 |
.from_float = quantize_row_q3_K,
|
| 726 |
+
.from_float_ref = (ggml_from_float_t) quantize_row_q3_K_ref,
|
| 727 |
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
| 728 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 729 |
.nrows = 1,
|
|
|
|
| 735 |
.is_quantized = true,
|
| 736 |
.to_float = (ggml_to_float_t) dequantize_row_q4_K,
|
| 737 |
.from_float = quantize_row_q4_K,
|
| 738 |
+
.from_float_ref = (ggml_from_float_t) quantize_row_q4_K_ref,
|
| 739 |
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
| 740 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 741 |
.nrows = 1,
|
|
|
|
| 747 |
.is_quantized = true,
|
| 748 |
.to_float = (ggml_to_float_t) dequantize_row_q5_K,
|
| 749 |
.from_float = quantize_row_q5_K,
|
| 750 |
+
.from_float_ref = (ggml_from_float_t) quantize_row_q5_K_ref,
|
| 751 |
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
| 752 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 753 |
.nrows = 1,
|
|
|
|
| 759 |
.is_quantized = true,
|
| 760 |
.to_float = (ggml_to_float_t) dequantize_row_q6_K,
|
| 761 |
.from_float = quantize_row_q6_K,
|
| 762 |
+
.from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref,
|
| 763 |
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
| 764 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 765 |
.nrows = 1,
|
|
|
|
| 771 |
.is_quantized = true,
|
| 772 |
.to_float = (ggml_to_float_t) dequantize_row_iq2_xxs,
|
| 773 |
.from_float = NULL,
|
| 774 |
+
.from_float_ref = NULL,
|
| 775 |
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
| 776 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 777 |
.nrows = 1,
|
|
|
|
| 783 |
.is_quantized = true,
|
| 784 |
.to_float = (ggml_to_float_t) dequantize_row_iq2_xs,
|
| 785 |
.from_float = NULL,
|
| 786 |
+
.from_float_ref = NULL,
|
| 787 |
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
| 788 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 789 |
.nrows = 1,
|
|
|
|
| 795 |
.is_quantized = true,
|
| 796 |
.to_float = (ggml_to_float_t) dequantize_row_iq3_xxs,
|
| 797 |
.from_float = quantize_row_iq3_xxs,
|
| 798 |
+
.from_float_ref = (ggml_from_float_t)quantize_row_iq3_xxs_ref,
|
| 799 |
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
| 800 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 801 |
.nrows = 1,
|
|
|
|
| 807 |
.is_quantized = true,
|
| 808 |
.to_float = (ggml_to_float_t) dequantize_row_iq3_s,
|
| 809 |
.from_float = quantize_row_iq3_s,
|
| 810 |
+
.from_float_ref = (ggml_from_float_t)quantize_row_iq3_s_ref,
|
| 811 |
.vec_dot = ggml_vec_dot_iq3_s_q8_K,
|
| 812 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 813 |
.nrows = 1,
|
|
|
|
| 819 |
.is_quantized = true,
|
| 820 |
.to_float = (ggml_to_float_t) dequantize_row_iq2_s,
|
| 821 |
.from_float = quantize_row_iq2_s,
|
| 822 |
+
.from_float_ref = (ggml_from_float_t)quantize_row_iq2_s_ref,
|
| 823 |
.vec_dot = ggml_vec_dot_iq2_s_q8_K,
|
| 824 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 825 |
.nrows = 1,
|
|
|
|
| 831 |
.is_quantized = true,
|
| 832 |
.to_float = (ggml_to_float_t) dequantize_row_iq1_s,
|
| 833 |
.from_float = NULL,
|
| 834 |
+
.from_float_ref = NULL,
|
| 835 |
.vec_dot = ggml_vec_dot_iq1_s_q8_K,
|
| 836 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 837 |
.nrows = 1,
|
|
|
|
| 843 |
.is_quantized = true,
|
| 844 |
.to_float = (ggml_to_float_t) dequantize_row_iq1_m,
|
| 845 |
.from_float = NULL,
|
| 846 |
+
.from_float_ref = NULL,
|
| 847 |
.vec_dot = ggml_vec_dot_iq1_m_q8_K,
|
| 848 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 849 |
.nrows = 1,
|
|
|
|
| 855 |
.is_quantized = true,
|
| 856 |
.to_float = (ggml_to_float_t) dequantize_row_iq4_nl,
|
| 857 |
.from_float = quantize_row_iq4_nl,
|
| 858 |
+
.from_float_ref = (ggml_from_float_t)quantize_row_iq4_nl_ref,
|
| 859 |
.vec_dot = ggml_vec_dot_iq4_nl_q8_0,
|
| 860 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 861 |
.nrows = 1,
|
|
|
|
| 867 |
.is_quantized = true,
|
| 868 |
.to_float = (ggml_to_float_t) dequantize_row_iq4_xs,
|
| 869 |
.from_float = quantize_row_iq4_xs,
|
| 870 |
+
.from_float_ref = (ggml_from_float_t)quantize_row_iq4_xs_ref,
|
| 871 |
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
| 872 |
.vec_dot_type = GGML_TYPE_Q8_K,
|
| 873 |
.nrows = 1,
|
|
|
|
| 886 |
.is_quantized = false,
|
| 887 |
.to_float = (ggml_to_float_t) ggml_bf16_to_fp32_row,
|
| 888 |
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
| 889 |
+
.from_float_ref = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
| 890 |
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
|
| 891 |
.vec_dot_type = GGML_TYPE_BF16,
|
| 892 |
.nrows = 1,
|
|
|
|
| 894 |
[GGML_TYPE_Q4_0_4_4] = {
|
| 895 |
.type_name = "q4_0_4x4",
|
| 896 |
.blck_size = QK4_0,
|
| 897 |
+
.blck_size_interleave = 4,
|
| 898 |
.type_size = sizeof(block_q4_0),
|
| 899 |
.is_quantized = true,
|
| 900 |
.to_float = NULL,
|
| 901 |
.from_float = NULL,
|
| 902 |
+
.from_float_ref = NULL,
|
| 903 |
.vec_dot = NULL,
|
| 904 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 905 |
.nrows = 1,
|
| 906 |
.ncols = 4,
|
|
|
|
| 907 |
.gemv = ggml_gemv_q4_0_4x4_q8_0,
|
| 908 |
.gemm = ggml_gemm_q4_0_4x4_q8_0,
|
| 909 |
},
|
| 910 |
[GGML_TYPE_Q4_0_4_8] = {
|
| 911 |
.type_name = "q4_0_4x8",
|
| 912 |
.blck_size = QK4_0,
|
| 913 |
+
.blck_size_interleave = 8,
|
| 914 |
.type_size = sizeof(block_q4_0),
|
| 915 |
.is_quantized = true,
|
| 916 |
.to_float = NULL,
|
| 917 |
.from_float = NULL,
|
| 918 |
+
.from_float_ref = NULL,
|
| 919 |
.vec_dot = NULL,
|
| 920 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 921 |
.nrows = 1,
|
| 922 |
.ncols = 4,
|
|
|
|
| 923 |
.gemv = ggml_gemv_q4_0_4x8_q8_0,
|
| 924 |
.gemm = ggml_gemm_q4_0_4x8_q8_0,
|
| 925 |
},
|
| 926 |
[GGML_TYPE_Q4_0_8_8] = {
|
| 927 |
.type_name = "q4_0_8x8",
|
| 928 |
.blck_size = QK4_0,
|
| 929 |
+
.blck_size_interleave = 8,
|
| 930 |
.type_size = sizeof(block_q4_0),
|
| 931 |
.is_quantized = true,
|
| 932 |
.to_float = NULL,
|
| 933 |
.from_float = NULL,
|
| 934 |
+
.from_float_ref = NULL,
|
| 935 |
.vec_dot = NULL,
|
| 936 |
.vec_dot_type = GGML_TYPE_Q8_0,
|
| 937 |
.nrows = 1,
|
| 938 |
.ncols = 8,
|
|
|
|
| 939 |
.gemv = ggml_gemv_q4_0_8x8_q8_0,
|
| 940 |
.gemm = ggml_gemm_q4_0_8x8_q8_0,
|
| 941 |
}
|
|
|
|
| 3115 |
return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN);
|
| 3116 |
}
|
| 3117 |
|
| 3118 |
+
GGML_CALL int64_t ggml_blck_size(enum ggml_type type) {
|
| 3119 |
return type_traits[type].blck_size;
|
| 3120 |
}
|
| 3121 |
|
|
|
|
| 12192 |
|
| 12193 |
const enum ggml_type type = src0->type;
|
| 12194 |
|
| 12195 |
+
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
| 12196 |
+
ggml_from_float_t const from_float = type_traits[vec_dot_type].from_float;
|
| 12197 |
+
ggml_from_float_to_mat_t const from_float_to_mat = type_traits[vec_dot_type].from_float_to_mat;
|
| 12198 |
+
int64_t const vec_dot_num_rows = type_traits[type].nrows;
|
| 12199 |
+
int64_t const matmul_num_cols = type_traits[type].ncols;
|
| 12200 |
+
int64_t const blck_size_interleave = type_traits[type].blck_size_interleave;
|
| 12201 |
+
ggml_gemv_t const gemv = type_traits[type].gemv;
|
| 12202 |
+
ggml_gemm_t const gemm = type_traits[type].gemm;
|
|
|
|
| 12203 |
|
| 12204 |
GGML_ASSERT(ne0 == ne01);
|
| 12205 |
GGML_ASSERT(ne1 == ne11);
|
|
|
|
| 12263 |
for (int64_t i11 = ith * 4; i11 < ne11 - ne11 % 4; i11 += nth * 4) {
|
| 12264 |
from_float_to_mat((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
| 12265 |
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
| 12266 |
+
4, ne10, blck_size_interleave);
|
| 12267 |
}
|
| 12268 |
i11_processed = ne11 - ne11 % 4;
|
| 12269 |
}
|
| 12270 |
for (int64_t i11 = i11_processed + ith; i11 < ne11; i11 += nth) {
|
| 12271 |
+
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
| 12272 |
+
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
| 12273 |
+
ne10);
|
| 12274 |
}
|
| 12275 |
}
|
| 12276 |
}
|
|
|
|
| 12354 |
int64_t src0_start = (ith * ne01) / nth;
|
| 12355 |
int64_t src0_end = ((ith + 1) * ne01) / nth;
|
| 12356 |
src0_start = (src0_start % matmul_num_cols) ? src0_start + matmul_num_cols - (src0_start % matmul_num_cols): src0_start;
|
| 12357 |
+
src0_end = (src0_end % matmul_num_cols) ? src0_end + matmul_num_cols - (src0_end % matmul_num_cols): src0_end;
|
| 12358 |
if (src0_start >= src0_end) return;
|
| 12359 |
|
| 12360 |
// If there are more than three rows in src1, use gemm; otherwise, use gemv.
|
|
|
|
| 12412 |
|
| 12413 |
const bool src1_cont = ggml_is_contiguous(src1);
|
| 12414 |
|
| 12415 |
+
ggml_vec_dot_t const vec_dot = type_traits[type].vec_dot;
|
| 12416 |
+
enum ggml_type const vec_dot_type = type_traits[type].vec_dot_type;
|
| 12417 |
+
ggml_from_float_t const from_float = type_traits[vec_dot_type].from_float;
|
| 12418 |
+
int64_t const matmul_num_cols = type_traits[type].ncols;
|
| 12419 |
+
ggml_gemv_t const gemv = type_traits[type].gemv;
|
| 12420 |
|
| 12421 |
// we don't support permuted src0 or src1
|
| 12422 |
GGML_ASSERT(nb00 == ggml_type_size(type));
|
|
|
|
| 12457 |
for (int64_t i13 = 0; i13 < ne13; ++i13) {
|
| 12458 |
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
| 12459 |
for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
|
| 12460 |
+
from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
|
| 12461 |
+
(void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
|
| 12462 |
+
ne10);
|
| 12463 |
}
|
| 12464 |
}
|
| 12465 |
}
|
|
|
|
| 21061 |
(int64_t) info->ne[3];
|
| 21062 |
|
| 21063 |
if (ne % ggml_blck_size(info->type) != 0) {
|
| 21064 |
+
fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
|
| 21065 |
+
__func__, info->name.data, (int) info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type));
|
| 21066 |
fclose(file);
|
| 21067 |
gguf_free(ctx);
|
| 21068 |
return NULL;
|