Thomas Fitzsimmons committed · commit e0a5614 · 1 parent: 4a68b87

ggml : reorganize POWER9 ppc64le SIMD code

ggml.c CHANGED
@@ -528,23 +528,21 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
 #elif defined(__POWER9_VECTOR__)
 
-
-//#define GGML_SIMD
+#define GGML_SIMD
 
 // F32 POWER9
 
 #define GGML_F32_STEP 32
-#define GGML_F32_EPR
+#define GGML_F32_EPR  4
 
-
-#define
-#define
-#define
-#define
-#define GGML_F32x4_STORE        vec_vsx_st
+#define GGML_F32x4              vector float
+#define GGML_F32x4_ZERO         0.0f
+#define GGML_F32x4_SET1         vec_splats
+#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
 #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
-#define GGML_F32x4_ADD
-#define GGML_F32x4_MUL
+#define GGML_F32x4_ADD          vec_add
+#define GGML_F32x4_MUL          vec_mul
 #define GGML_F32x4_REDUCE(res, x) \
 { \
     for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
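For context on how these macros are consumed elsewhere in ggml.c: the generic GGML_SIMD loops process GGML_F32_STEP (32) floats per iteration, spread across GGML_F32_ARR = GGML_F32_STEP/GGML_F32_EPR (8) vector accumulators of GGML_F32_EPR (4) lanes each, then fold the accumulators pairwise. A portable plain-C sketch of that shape (EPR/STEP/ARR and dot_f32_sketch are illustrative names, not code from ggml.c):

#include <stddef.h>
#include <stdio.h>

/* Illustrative stand-ins mirroring GGML_F32_EPR = 4, GGML_F32_STEP = 32,
 * and GGML_F32_ARR = STEP/EPR = 8 accumulator "registers". */
#define EPR  4
#define STEP 32
#define ARR  (STEP / EPR)

static float dot_f32_sketch(size_t n, const float *x, const float *y) {
    float acc[ARR][EPR] = {{0}};          /* GGML_F32_VEC sum[GGML_F32_ARR] */

    size_t np = n & ~(size_t)(STEP - 1);  /* largest multiple of STEP */
    for (size_t i = 0; i < np; i += STEP)
        for (int j = 0; j < ARR; j++)     /* one FMA per register */
            for (int k = 0; k < EPR; k++)
                acc[j][k] += x[i + j*EPR + k] * y[i + j*EPR + k];

    /* the REDUCE step: fold registers pairwise, then sum the lanes */
    for (int s = ARR/2; s > 0; s /= 2)
        for (int j = 0; j < s; j++)
            for (int k = 0; k < EPR; k++)
                acc[j][k] += acc[j + s][k];

    float res = 0.0f;
    for (int k = 0; k < EPR; k++) res += acc[0][k];

    for (size_t i = np; i < n; ++i)       /* scalar tail */
        res += x[i] * y[i];
    return res;
}

int main(void) {
    float x[70], y[70];
    for (int i = 0; i < 70; i++) { x[i] = 1.0f; y[i] = 2.0f; }
    printf("%f\n", dot_f32_sketch(70, x, y));  /* prints 140.000000 */
    return 0;
}

The eight independent accumulators keep the FMA pipeline busy; the pairwise fold has the same shape as the for (int i = 0; i < GGML_F32_ARR/2; ++i) loop visible in the GGML_F32x4_REDUCE context above.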
@@ -573,8 +571,20 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
 
 // F16 POWER9
-
-
+#define GGML_F16_STEP       GGML_F32_STEP
+#define GGML_F16_EPR        GGML_F32_EPR
+#define GGML_F16_VEC        GGML_F32x4
+#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
+// Use vec_xl, not vec_ld, in case the load address is not aligned.
+#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   \
+  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
+  vec_extract_fp32_from_shortl(vec_xl(0, p))
+#define GGML_F16_VEC_STORE(p, r, i)  \
+  if (i & 0x1)                       \
+    vec_xst(vec_pack_to_short_fp32(r[i], r[i - 1]), 0, p - GGML_F16_EPR)
 
 #elif defined(__wasm_simd128__)
 
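The F16 macros reuse the 4-float F32 vectors, but a single 128-bit vec_xl of ggml_fp16_t data brings in 8 halfs. GGML_F16_VEC_LOAD therefore keys off the parity of the register index i: even registers convert the low 4 halfs of the 16 bytes at p (vec_extract_fp32_from_shortl), while odd registers reload the same 16 bytes at p - GGML_F16_EPR and convert the high 4 (vec_extract_fp32_from_shorth), so each register pair shares one 16-byte span. A scalar model of that addressing (load4_model and fp16_t are illustrative; the numeric half-to-float conversion is elided):

#include <assert.h>
#include <stdint.h>
#include <string.h>

typedef uint16_t fp16_t;  /* raw IEEE half bits; conversion elided */

/* The generic loop calls LOAD(x + i + j*4, j).  Even j loads 8 halfs at p
 * and keeps the low 4; odd j loads the SAME 16 bytes, at p - 4, and keeps
 * the high 4.  Either way the result is p[0..3] -- parity only decides
 * which half of which 16-byte load supplies it. */
static void load4_model(const fp16_t *p, int j, fp16_t out[4]) {
    fp16_t eight[8];
    if (j & 0x1) {
        memcpy(eight, p - 4, sizeof eight);        /* ..._from_shorth */
        memcpy(out, eight + 4, 4 * sizeof(fp16_t));
    } else {
        memcpy(eight, p, sizeof eight);            /* ..._from_shortl */
        memcpy(out, eight, 4 * sizeof(fp16_t));
    }
}

int main(void) {
    fp16_t buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    fp16_t a[4], b[4];
    load4_model(buf,     0, a);  /* register 0: buf[0..3] */
    load4_model(buf + 4, 1, b);  /* register 1: buf[4..7], via the p-4 path */
    assert(a[0] == 1 && b[0] == 5);
    return 0;
}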
@@ -777,76 +787,6 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {
     for (int i = np; i < n; ++i) {
         sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
     }
-#elif defined(__POWER9_VECTOR__)
-    // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
-    // being able to test it. hoping someone with access to a POWER9 machine can help out here.
-    const int n32 = (n & ~31);
-
-    vector float sum0 = vec_splats (0.0f);
-    vector float sum1 = vec_splats (0.0f);
-    vector float sum2 = vec_splats (0.0f);
-    vector float sum3 = vec_splats (0.0f);
-    vector float sum4 = vec_splats (0.0f);
-    vector float sum5 = vec_splats (0.0f);
-    vector float sum6 = vec_splats (0.0f);
-    vector float sum7 = vec_splats (0.0f);
-
-    for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
-        // Use vec_xl, not vec_ld, because x is sometimes unaligned.
-        vector unsigned short x0 = vec_xl(j + 0, x);
-        vector unsigned short x1 = vec_xl(j + 16, x);
-        vector unsigned short x2 = vec_xl(j + 32, x);
-        vector unsigned short x3 = vec_xl(j + 48, x);
-
-        vector unsigned short y0 = vec_ld(j + 0, y);
-        vector unsigned short y1 = vec_ld(j + 16, y);
-        vector unsigned short y2 = vec_ld(j + 32, y);
-        vector unsigned short y3 = vec_ld(j + 48, y);
-
-        vector float fx0l = vec_extract_fp32_from_shortl(x0);
-        vector float fx0h = vec_extract_fp32_from_shorth(x0);
-        vector float fx1l = vec_extract_fp32_from_shortl(x1);
-        vector float fx1h = vec_extract_fp32_from_shorth(x1);
-        vector float fx2l = vec_extract_fp32_from_shortl(x2);
-        vector float fx2h = vec_extract_fp32_from_shorth(x2);
-        vector float fx3l = vec_extract_fp32_from_shortl(x3);
-        vector float fx3h = vec_extract_fp32_from_shorth(x3);
-
-        vector float fy0l = vec_extract_fp32_from_shortl(y0);
-        vector float fy0h = vec_extract_fp32_from_shorth(y0);
-        vector float fy1l = vec_extract_fp32_from_shortl(y1);
-        vector float fy1h = vec_extract_fp32_from_shorth(y1);
-        vector float fy2l = vec_extract_fp32_from_shortl(y2);
-        vector float fy2h = vec_extract_fp32_from_shorth(y2);
-        vector float fy3l = vec_extract_fp32_from_shortl(y3);
-        vector float fy3h = vec_extract_fp32_from_shorth(y3);
-
-        sum0 = vec_madd(fx0l, fy0l, sum0);
-        sum1 = vec_madd(fx0h, fy0h, sum1);
-        sum2 = vec_madd(fx1l, fy1l, sum2);
-        sum3 = vec_madd(fx1h, fy1h, sum3);
-        sum4 = vec_madd(fx2l, fy2l, sum4);
-        sum5 = vec_madd(fx2h, fy2h, sum5);
-        sum6 = vec_madd(fx3l, fy3l, sum6);
-        sum7 = vec_madd(fx3h, fy3h, sum7);
-    }
-
-    sum0 = vec_add(sum0, sum1);
-    sum2 = vec_add(sum2, sum3);
-    sum4 = vec_add(sum4, sum5);
-    sum6 = vec_add(sum6, sum7);
-
-    sum0 = vec_add(sum0, sum2);
-    sum4 = vec_add(sum4, sum6);
-
-    sum0 = vec_add(sum0, sum4);
-
-    sumf = vec_extract(sum0, 0) + vec_extract(sum0, 1)
-         + vec_extract(sum0, 2) + vec_extract(sum0, 3);
-
-    for (int i = n32; i < n; ++i) {
-        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
-    }
 #else
     for (int i = 0; i < n; ++i) {
         sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
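With the hand-rolled block above removed, POWER9 builds now take the generic GGML_SIMD path for ggml_vec_dot_f16, and the #else branch's scalar GGML_FP16_TO_FP32 fallback covers builds without SIMD. For reference, a generic IEEE-754 half-to-float conversion of the kind such a fallback performs (this fp16_to_fp32 is a sketch, not ggml's GGML_FP16_TO_FP32, whose implementation is platform-dependent):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Expand raw IEEE-754 binary16 bits to a float (NaN payloads not preserved). */
static float fp16_to_fp32(uint16_t h) {
    uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    uint32_t exp  = (h >> 10) & 0x1f;
    uint32_t man  = h & 0x3ff;
    uint32_t bits;
    if (exp == 0) {                /* zero or subnormal */
        if (man == 0) {
            bits = sign;
        } else {                   /* normalize the subnormal */
            exp = 127 - 15 + 1;
            while (!(man & 0x400)) { man <<= 1; exp--; }
            man &= 0x3ff;
            bits = sign | (exp << 23) | (man << 13);
        }
    } else if (exp == 0x1f) {      /* inf or NaN */
        bits = sign | 0x7f800000 | (man << 13);
    } else {                       /* normal: rebias exponent, widen mantissa */
        bits = sign | ((exp - 15 + 127) << 23) | (man << 13);
    }
    float f;
    memcpy(&f, &bits, sizeof f);
    return f;
}

int main(void) {
    /* 0x3C00 = 1.0, 0xC000 = -2.0, 0x7BFF = 65504.0 (largest finite half) */
    printf("%f %f %f\n", fp16_to_fp32(0x3C00), fp16_to_fp32(0xC000), fp16_to_fp32(0x7BFF));
    return 0;
}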
@@ -911,65 +851,6 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) {
         GGML_ASSERT(false);
         y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
     }
-#elif defined(__POWER9_VECTOR__)
-    // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
-    // being able to test it. hoping someone with access to a POWER9 machine can help out here.
-    const int n32 = (n & ~31);
-    for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
-        // Use vec_xl, not vec_ld, because x is sometimes unaligned!
-        vector unsigned short x0 = vec_xl(j + 0, x);
-        vector unsigned short x1 = vec_xl(j + 16, x);
-        vector unsigned short x2 = vec_xl(j + 32, x);
-        vector unsigned short x3 = vec_xl(j + 48, x);
-
-        vector unsigned short y0 = vec_xl(j + 0, y);
-        vector unsigned short y1 = vec_xl(j + 16, y);
-        vector unsigned short y2 = vec_xl(j + 32, y);
-        vector unsigned short y3 = vec_xl(j + 48, y);
-
-        vector float v4 = vec_splats(v);
-
-        vector float fx0l = vec_extract_fp32_from_shortl(x0);
-        vector float fx0h = vec_extract_fp32_from_shorth(x0);
-        vector float fx1l = vec_extract_fp32_from_shortl(x1);
-        vector float fx1h = vec_extract_fp32_from_shorth(x1);
-        vector float fx2l = vec_extract_fp32_from_shortl(x2);
-        vector float fx2h = vec_extract_fp32_from_shorth(x2);
-        vector float fx3l = vec_extract_fp32_from_shortl(x3);
-        vector float fx3h = vec_extract_fp32_from_shorth(x3);
-
-        vector float fy0l = vec_extract_fp32_from_shortl(y0);
-        vector float fy0h = vec_extract_fp32_from_shorth(y0);
-        vector float fy1l = vec_extract_fp32_from_shortl(y1);
-        vector float fy1h = vec_extract_fp32_from_shorth(y1);
-        vector float fy2l = vec_extract_fp32_from_shortl(y2);
-        vector float fy2h = vec_extract_fp32_from_shorth(y2);
-        vector float fy3l = vec_extract_fp32_from_shortl(y3);
-        vector float fy3h = vec_extract_fp32_from_shorth(y3);
-
-        fy0l = vec_madd(fx0l, v4, fy0l);
-        fy0h = vec_madd(fx0h, v4, fy0h);
-        fy1l = vec_madd(fx1l, v4, fy1l);
-        fy1h = vec_madd(fx1h, v4, fy1h);
-        fy2l = vec_madd(fx2l, v4, fy2l);
-        fy2h = vec_madd(fx2h, v4, fy2h);
-        fy3l = vec_madd(fx3l, v4, fy3l);
-        fy3h = vec_madd(fx3h, v4, fy3h);
-
-        y0 = vec_pack_to_short_fp32(fy0h, fy0l);
-        y1 = vec_pack_to_short_fp32(fy1h, fy1l);
-        y2 = vec_pack_to_short_fp32(fy2h, fy2l);
-        y3 = vec_pack_to_short_fp32(fy3h, fy3l);
-
-        vec_xst(y0, j + 0, y);
-        vec_xst(y1, j + 16, y);
-        vec_xst(y2, j + 32, y);
-        vec_xst(y3, j + 48, y);
-    }
-
-    for (int i = n32; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
-    }
 #else
     for (int i = 0; i < n; ++i) {
         y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
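ggml_vec_mad_f16 computes y[i] += x[i]*v in fp16 storage, so every step is a load-convert-FMA-pack-store round trip. In the new macro scheme the store side mirrors the paired load: GGML_F16_VEC_STORE is a no-op for even register indices and, on odd ones, packs the pair (r[i-1], r[i]) back to 8 halfs with vec_pack_to_short_fp32 and writes both registers' 16 bytes at once. A scalar model of that pairing (store4_model and fp16_t are illustrative; conversion elided):

#include <assert.h>
#include <stdint.h>
#include <string.h>

typedef uint16_t fp16_t;  /* raw half bits; conversion elided as before */

/* STORE(p, r, j) does nothing for even j; for odd j it writes registers
 * r[j-1] and r[j] as one 8-half block starting at p - 4, covering both
 * registers' 16 bytes (vec_pack_to_short_fp32 + vec_xst in the macro). */
static void store4_model(fp16_t *p, fp16_t r[][4], int j) {
    if (j & 0x1) {
        memcpy(p - 4, r[j - 1], 4 * sizeof(fp16_t));  /* low half  */
        memcpy(p,     r[j],     4 * sizeof(fp16_t));  /* high half */
    }
}

int main(void) {
    fp16_t y[8] = {0};
    fp16_t r[2][4] = {{1, 2, 3, 4}, {5, 6, 7, 8}};
    store4_model(y,     r, 0);  /* even j: deferred, no write */
    store4_model(y + 4, r, 1);  /* odd j: writes y[0..7] in one go */
    assert(y[0] == 1 && y[7] == 8);
    return 0;
}

Deferring the even-indexed store is safe because the odd-indexed store always writes both halves of the pair.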