Commit c1442f3
ggerganov and junchao-loongson committed
1 Parent(s): 6dbbbab

ggml : fix loongson compile warnings (llama/7537)


* ggml : fix loongson compile warnings

ggml-ci

* Fix loongarch quantize test failure.

Fix an unexpected error introduced while rebasing the code.

* tests : disable json test due to lack of python on the CI node

ggml-ci

---------

Co-authored-by: junchao-loongson <[email protected]>
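The unused-variable warnings on the LoongArch (LASX) path are silenced with GGML_UNUSED, which in ggml is essentially a cast of its argument to void. Below is a minimal, self-contained sketch of that idiom; the macro body mirrors ggml's but is reproduced here only for illustration, and the mask values are illustrative stand-ins for the constants the real kernels declare:

    // Sketch of the GGML_UNUSED warning-suppression idiom used in the hunks below.
    #include <stdint.h>

    #ifndef GGML_UNUSED
    #define GGML_UNUSED(x) (void)(x)   // evaluates x once and discards it
    #endif

    int main(void) {
        const uint32_t kmask1 = 0x3f3f3f3f;   // illustrative mask values
        const uint32_t kmask2 = 0x0f0f0f0f;
        const uint32_t kmask3 = 0x03030303;

        // Referencing the constants through the macro counts as a use, so
        // -Wunused-variable stays quiet even on code paths that skip them.
        GGML_UNUSED(kmask1);
        GGML_UNUSED(kmask2);
        GGML_UNUSED(kmask3);
        return 0;
    }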

Files changed (2):
  1. ggml-quants.c +23 -3
  2. ggml.c +7 -6
ggml-quants.c CHANGED

@@ -6088,6 +6088,7 @@ void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
         const uint8_t * restrict q2 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
+
         const __m128i mins_and_scales = __lsx_vld((const __m128i*)x[i].scales, 0);
         const __m128i scales8 = __lsx_vand_v(mins_and_scales, m4);
         const __m128i mins8 = __lsx_vand_v(__lsx_vsrli_h(mins_and_scales, 4), m4);
@@ -6807,6 +6808,8 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     for (int i = 0; i < nb; ++i) {
 
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
+        const uint8_t * restrict q3 = x[i].qs;
+        const int8_t  * restrict q8 = y[i].qs;
         // Set up scales
         memcpy(aux, x[i].scales, 12);
         __m128i scales128 = lsx_set_w(
@@ -6830,8 +6833,6 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         int is  = 0;
         __m256i xvbit;
 
-        const uint8_t * restrict q3 = x[i].qs;
-        const int8_t  * restrict q8 = y[i].qs;
 
         for (int j = 0; j < QK_K/128; ++j) {
             // load low 2 bits
@@ -7404,6 +7405,9 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     *s = vec_extract(vsumf0, 0);
 
 #elif defined __loongarch_asx
+    GGML_UNUSED(kmask1);
+    GGML_UNUSED(kmask2);
+    GGML_UNUSED(kmask3);
 
     const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
 
@@ -7416,6 +7420,11 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
         memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
 
         const uint8_t * restrict q4 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
@@ -7455,16 +7464,17 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
         __m256 vd = __lasx_xvreplfr2vr_s(d);
         acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
+
     }
 
     acc_m = __lsx_vfadd_s(acc_m, (__m128)__lsx_vpermi_w((__m128i)acc_m, (__m128i)acc_m, 0xee));
     __m128i tmp1 = __lsx_vinsgr2vr_w(__lsx_vldi(0), __lsx_vpickve2gr_w((__m128i)acc_m, 1), 0);
     acc_m = __lsx_vfadd_s(acc_m, (__m128)tmp1);
 
+
     ft_union fi;
     fi.i = __lsx_vpickve2gr_w(acc_m, 0);
     *s = hsum_float_8(acc) + fi.f ;
-
 #else
 
     const uint8_t * scales = (const uint8_t*)&utmp[0];
@@ -8002,6 +8012,9 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     *s = vec_extract(vsumf0, 0);
 
 #elif defined __loongarch_asx
+    GGML_UNUSED(kmask1);
+    GGML_UNUSED(kmask2);
+    GGML_UNUSED(kmask3);
 
     const __m256i m4 = __lasx_xvreplgr2vr_b(0xF);
     const __m128i mzero = __lsx_vldi(0);
@@ -8020,6 +8033,11 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);
 
         memcpy(utmp, x[i].scales, 12);
+        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
+        const uint32_t uaux = utmp[1] & kmask1;
+        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
+        utmp[2] = uaux;
+        utmp[0] &= kmask1;
 
         const __m256i mins_and_scales = lasx_extu8_16(lsx_set_w(utmp[3], utmp[2], utmp[1], utmp[0]));
 
@@ -8069,10 +8087,12 @@ void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             p16_1 = lasx_madd_h(scale_1, p16_1);
 
             sumi = __lasx_xvadd_w(sumi, __lasx_xvadd_w(p16_0, p16_1));
+
         }
 
         __m256 vd = __lasx_xvreplfr2vr_s(d);
         acc = __lasx_xvfmadd_s(vd, __lasx_xvffint_s_w(sumi), acc);
+
     }
 
     *s = hsum_float_8(acc) + summs;
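For context, the lines restored after memcpy(utmp, x[i].scales, 12) re-expand the 12 packed scale/min bytes of a Q4_K/Q5_K block into four 32-bit words, presumably the rebase regression behind the quantize test failure mentioned in the commit message. A standalone scalar sketch of that unpacking follows; the helper name is hypothetical and the mask values are taken to match the K-quant kernels:

    // Scalar sketch (not ggml's API) of the scale/min unpacking restored in
    // the q4_K/q5_K hunks above: 12 packed bytes are copied into utmp[0..2]
    // and reshuffled so the 6-bit scales and 6-bit mins land in separate halves.
    #include <stdint.h>
    #include <string.h>

    static void unpack_kquant_scales(const uint8_t scales[12], uint32_t utmp[4]) {
        const uint32_t kmask1 = 0x3f3f3f3f;
        const uint32_t kmask2 = 0x0f0f0f0f;
        const uint32_t kmask3 = 0x03030303;

        memcpy(utmp, scales, 12);
        utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4);
        const uint32_t uaux = utmp[1] & kmask1;
        utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
        utmp[2] = uaux;
        utmp[0] &= kmask1;
    }

The fallback path in the same file then reads the unpacked scales via (const uint8_t*)&utmp[0], as visible in the #else branch shown above.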
ggml.c CHANGED

@@ -1576,11 +1576,11 @@ do { \
 
 // F16 arithmetic is not supported by AVX, so we use F32 instead
 
-#define GGML_F32Cx8 __m256
+#define GGML_F32Cx8 __m256
 #define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
 #define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplgr2vr_w((x))
 
-static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t *x) {
+static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
     float tmp[8];
 
     for (int i = 0; i < 8; i++) {
@@ -1589,13 +1589,14 @@ static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t *x) {
 
     return (__m256)__lasx_xvld(tmp, 0);
 }
-static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
+static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
     float arr[8];
 
     __lasx_xvst(y, arr, 0);
 
-    for (int i = 0; i < 8; i++)
+    for (int i = 0; i < 8; i++) {
         x[i] = GGML_FP32_TO_FP16(arr[i]);
+    }
 }
 #define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
 #define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)
@@ -1671,7 +1672,7 @@ static inline void __lasx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
 #define GGML_F16_STEP 32
 #define GGML_F16_EPR  4
 
-static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
+static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
     float tmp[4];
 
     tmp[0] = GGML_FP16_TO_FP32(x[0]);
@@ -1682,7 +1683,7 @@ static inline __m128 __lsx_f16x4_load(ggml_fp16_t *x) {
     return __lsx_vld(tmp, 0);
 }
 
-static inline void __lsx_f16x4_store(ggml_fp16_t *x, __m128 y) {
+static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
     float arr[4];
 
     __lsx_vst(y, arr, 0);
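The ggml.c changes are const-correctness and style fixes for the LoongArch F16 helpers: the load helpers only read from x, so taking a const ggml_fp16_t * avoids qualifier warnings when callers pass read-only data, and the store loop gains braces. A portable sketch of the same idea without LSX/LASX intrinsics; the fp16_t type and fp16_to_fp32() below are simplified stand-ins for ggml_fp16_t and GGML_FP16_TO_FP32, not ggml's definitions:

    #include <stdint.h>

    typedef uint16_t fp16_t;              // stand-in for ggml_fp16_t

    static float fp16_to_fp32(fp16_t h) {
        return (float) h;                 // placeholder conversion for the sketch
    }

    // const parameter: the helper only reads x, so passing a read-only buffer
    // compiles without "discards 'const' qualifier" warnings.
    static void f16x4_load(const fp16_t * x, float out[4]) {
        for (int i = 0; i < 4; ++i) {
            out[i] = fp16_to_fp32(x[i]);
        }
    }

    int main(void) {
        const fp16_t src[4] = {1, 2, 3, 4};   // read-only input
        float dst[4];
        f16x4_load(src, dst);                 // fine: parameter is const-qualified
        return 0;
    }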