ggml : fix loongson compile warnings (llama/7537)
* ggml : fix loongson compile warnings
ggml-ci
* Fix loongarch quantize test failure.
Fix an unexpected error introduced during the rebase.
* tests : disable json test due to lack of python on the CI node
ggml-ci
---------
Co-authored-by: junchao-loongson <[email protected]>
- ggml-quants.c +23 -3
- ggml.c +7 -6
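
The warnings referenced in the commit message are mostly unused-variable warnings in the LoongArch (LASX/LSX) code paths of ggml-quants.c, silenced with ggml's GGML_UNUSED macro, plus const-correctness and missing-brace fixes in the F16 helpers in ggml.c. As a rough, hedged illustration of the idiom (the macro and variable names below are made up for this sketch, not taken from the commit), casting a value to void is the standard way to tell the compiler a variable is deliberately unused:

#include <stdio.h>

// Illustrative stand-in for GGML_UNUSED; the cast to void marks the value as used.
#define MY_UNUSED(x) (void)(x)

int main(void) {
    const unsigned kmask_example = 0x3f3f3f3fu; // only consumed on some build paths
    MY_UNUSED(kmask_example);                   // suppresses -Wunused-variable elsewhere
    printf("no unused-variable warning here\n");
    return 0;
}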
ggml-quants.c
CHANGED
@@ -6088,6 +6088,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r

        const uint8_t * restrict q2 = x[i].qs;
        const int8_t * restrict q8 = y[i].qs;
+
        const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0);
        const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4);
        const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4);

@@ -6807,6 +6808,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
    for (int i = 0; i < nb; ++i) {

        const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+       const uint8_t * restrict q3 = x[i].qs;
+       const int8_t * restrict q8 = y[i].qs;
        // Set up scales
        memcpy(aux, x[i].scales, 12);
        __m128i scales128 = lsx_set_w(

@@ -6830,8 +6833,6 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
        int is = 0;
        __m256i xvbit;

-       const uint8_t * restrict q3 = x[i].qs;
-       const int8_t * restrict q8 = y[i].qs;

        for (int j = 0; j < QK_K/128; ++j) {
            // load low 2 bits

@@ -7404,6 +7405,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
    *s = vec_extract(vsumf0, 0);

#elif defined __loongarch_asx
+   GGML_UNUSED(kmask1);
+   GGML_UNUSED(kmask2);
+   GGML_UNUSED(kmask3);

    const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);

@@ -7416,6 +7420,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        memcpy(utmp, x[i].scales, 12);
+       utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+       const uint32_t uaux = utmp[1] & kmask1;
+       utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+       utmp[2] = uaux;
+       utmp[0] &= kmask1;

        const uint8_t * restrict q4 = x[i].qs;
        const int8_t * restrict q8 = y[i].qs;

@@ -7455,16 +7464,17 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r

        __m256 vd = __lasx_xvreplfr2vr_s(d);
        acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
+
    }

    acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee));
    __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0);
    acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);

+
    ft_union fi;
    fi.i = __lsx_vpickve2gr_w(acc_m, 0);
    *s = hsum_float_8(acc) + fi.f ;
-
#else

    const uint8_t * scales = (const uint8_t*)&utmp[0];

@@ -8002,6 +8012,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
    *s = vec_extract(vsumf0, 0);

#elif defined __loongarch_asx
+   GGML_UNUSED(kmask1);
+   GGML_UNUSED(kmask2);
+   GGML_UNUSED(kmask3);

    const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
    const __m128i mzero = __lsx_vldi(0);

@@ -8020,6 +8033,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
        const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

        memcpy(utmp, x[i].scales, 12);
+       utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+       const uint32_t uaux = utmp[1] & kmask1;
+       utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+       utmp[2] = uaux;
+       utmp[0] &= kmask1;

        const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));

@@ -8069,10 +8087,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
            p16_1 = lasx_madd_h(scale_1, p16_1);

            sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
+
        }

        __m256 vd = __lasx_xvreplfr2vr_s(d);
        acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
+
    }

    *s = hsum_float_8(acc) + summs;

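The q4_K and q5_K hunks above re-add the unpacking of the packed 12-byte scales field into utmp before the scales and mins are consumed; per the commit message, these lines were lost during a rebase, which is what broke the quantize test. The sketch below reproduces that unpack step on a standalone buffer. The kmask constants are the ones used for the K-quant scale unpacking in ggml-quants.c; the wrapper function and test values are illustrative only and not part of ggml.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Mask constants as used for Q4_K/Q5_K scale unpacking in ggml-quants.c.
static const uint32_t kmask1 = 0x3f3f3f3f;
static const uint32_t kmask2 = 0x0f0f0f0f;
static const uint32_t kmask3 = 0x03030303;

// Illustrative helper: expand the 12 packed scale/min bytes into utmp[0..3],
// exactly as the re-added lines in the diff do.
static void unpack_scales_k(const uint8_t scales[12], uint32_t utmp[4]) {
    memcpy(utmp, scales, 12);
    utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
    const uint32_t uaux = utmp[1] & kmask1;
    utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
    utmp[2] = uaux;
    utmp[0] &= kmask1;
}

int main(void) {
    uint8_t scales[12] = {0};   // dummy packed scales, for illustration only
    uint32_t utmp[4];
    unpack_scales_k(scales, utmp);
    printf("%08x %08x %08x %08x\n", utmp[0], utmp[1], utmp[2], utmp[3]);
    return 0;
}
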
ggml.c
CHANGED
@@ -1576,11 +1576,11 @@ do { \

// F16 arithmetic is not supported by AVX, so we use F32 instead

-#define GGML_F32Cx8
+#define GGML_F32Cx8 __m256
#define GGML_F32Cx8_ZERO (__m256)__lasx_xvldi(0)
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))

-static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t *x) {
+static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
    float tmp[8];

    for (int i = 0; i < 8; i++) {

@@ -1589,13 +1589,14 @@ static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t *x) {

    return (__m256)__lasx_xvld(tmp, 0);
}
-static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    float arr[8];

    __lasx_xvst(y, arr, 0);

-   for (int i = 0; i < 8; i++)
+   for (int i = 0; i < 8; i++) {
        x[i] = GGML_FP32_TO_FP16(arr[i]);
+   }
}
#define GGML_F32Cx8_LOAD(x) __lasx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)

@@ -1671,7 +1672,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
#define GGML_F16_STEP 32
#define GGML_F16_EPR 4

-static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
    float tmp[4];

    tmp[0] = GGML_FP16_TO_FP32(x[0]);

@@ -1682,7 +1683,7 @@ static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
    return __lsx_vld(tmp, 0);
}

-static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float arr[4];

    __lsx_vst(y, arr, 0);