Thomas Fitzsimmons committed
Commit e0a5614 · Parent: 4a68b87

ggml : reorganize POWER9 ppc64le SIMD code

Files changed (1): ggml.c (+23 −142)
ggml.c CHANGED
@@ -528,23 +528,21 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 
 #elif defined(__POWER9_VECTOR__)
 
-// TODO: uncomment this when it works
-//#define GGML_SIMD
+#define GGML_SIMD
 
 // F32 POWER9
 
 #define GGML_F32_STEP 32
-#define GGML_F32_EPR  8
+#define GGML_F32_EPR  4
 
-// TODO: not tested !!
-#define GGML_F32x4              __vector float
-#define GGML_F32x4_ZERO         (__vector float){0.0f, 0.0f, 0.0f, 0.0f}
-#define GGML_F32x4_SET1(x)      (__vector float){x, x, x, x}
-#define GGML_F32x4_LOAD         vec_vsx_ld
-#define GGML_F32x4_STORE        vec_vsx_st
+#define GGML_F32x4              vector float
+#define GGML_F32x4_ZERO         0.0f
+#define GGML_F32x4_SET1         vec_splats
+#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
+#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
 #define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
-#define GGML_F32x4_ADD          vec_add
-#define GGML_F32x4_MUL          vec_mul
+#define GGML_F32x4_ADD          vec_add
+#define GGML_F32x4_MUL          vec_mul
 #define GGML_F32x4_REDUCE(res, x) \
 { \
     for (int i = 0; i < GGML_F32_ARR/2; ++i) { \
@@ -573,8 +571,20 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
 #define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
 
 // F16 POWER9
-// TODO: implement here
-// ...
+#define GGML_F16_STEP       GGML_F32_STEP
+#define GGML_F16_EPR        GGML_F32_EPR
+#define GGML_F16_VEC        GGML_F32x4
+#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO
+#define GGML_F16_VEC_SET1   GGML_F32x4_SET1
+#define GGML_F16_VEC_FMA    GGML_F32x4_FMA
+#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
+// Use vec_xl, not vec_ld, in case the load address is not aligned.
+#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                       \
+    vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) :   \
+    vec_extract_fp32_from_shortl(vec_xl(0, p))
+#define GGML_F16_VEC_STORE(p, r, i)  \
+    if (i & 0x1)                     \
+        vec_xst(vec_pack_to_short_fp32(r[i], r[i - 1]), 0, p - GGML_F16_EPR)
 
 #elif defined(__wasm_simd128__)
 
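A note on the new F16 macros: GGML_F16_EPR is four floats, but the POWER9 conversion intrinsics work on eight fp16 values at a time. GGML_F16_VEC_LOAD(p, i) therefore converts the low four halfwords of a 16-byte block for an even index, and reloads the same block (at p - GGML_F16_EPR) and converts the high four for an odd index; GGML_F16_VEC_STORE writes only on odd indices, packing the register pair r[i - 1], r[i] back into a single 8 x fp16 store. A minimal standalone sketch of the intrinsics involved (fp16_roundtrip is a hypothetical helper, not part of ggml.c; assumes a POWER9 toolchain, e.g. gcc -mcpu=power9):

    #include <altivec.h>

    // Convert eight consecutive fp16 values (stored as unsigned shorts) to
    // two float vectors and pack them back, mirroring the F16 macros above.
    static void fp16_roundtrip(const unsigned short *in, unsigned short *out) {
        vector unsigned short h  = vec_xl(0, in);                   // unaligned 8 x fp16 load
        vector float          lo = vec_extract_fp32_from_shortl(h); // low  four as fp32
        vector float          hi = vec_extract_fp32_from_shorth(h); // high four as fp32
        vec_xst(vec_pack_to_short_fp32(hi, lo), 0, out);            // repack and store
    }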
 
@@ -777,76 +787,6 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t
     for (int i = np; i < n; ++i) {
         sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
     }
-#elif defined(__POWER9_VECTOR__)
-    // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
-    //       being able to test it. hoping someone with access to a POWER9 machine can help out here.
-    const int n32 = (n & ~31);
-
-    vector float sum0 = vec_splats (0.0f);
-    vector float sum1 = vec_splats (0.0f);
-    vector float sum2 = vec_splats (0.0f);
-    vector float sum3 = vec_splats (0.0f);
-    vector float sum4 = vec_splats (0.0f);
-    vector float sum5 = vec_splats (0.0f);
-    vector float sum6 = vec_splats (0.0f);
-    vector float sum7 = vec_splats (0.0f);
-
-    for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
-        // Use vec_xl, not vec_ld, because x is sometimes unaligned.
-        vector unsigned short x0 = vec_xl(j +  0, x);
-        vector unsigned short x1 = vec_xl(j + 16, x);
-        vector unsigned short x2 = vec_xl(j + 32, x);
-        vector unsigned short x3 = vec_xl(j + 48, x);
-
-        vector unsigned short y0 = vec_ld(j +  0, y);
-        vector unsigned short y1 = vec_ld(j + 16, y);
-        vector unsigned short y2 = vec_ld(j + 32, y);
-        vector unsigned short y3 = vec_ld(j + 48, y);
-
-        vector float fx0l = vec_extract_fp32_from_shortl(x0);
-        vector float fx0h = vec_extract_fp32_from_shorth(x0);
-        vector float fx1l = vec_extract_fp32_from_shortl(x1);
-        vector float fx1h = vec_extract_fp32_from_shorth(x1);
-        vector float fx2l = vec_extract_fp32_from_shortl(x2);
-        vector float fx2h = vec_extract_fp32_from_shorth(x2);
-        vector float fx3l = vec_extract_fp32_from_shortl(x3);
-        vector float fx3h = vec_extract_fp32_from_shorth(x3);
-
-        vector float fy0l = vec_extract_fp32_from_shortl(y0);
-        vector float fy0h = vec_extract_fp32_from_shorth(y0);
-        vector float fy1l = vec_extract_fp32_from_shortl(y1);
-        vector float fy1h = vec_extract_fp32_from_shorth(y1);
-        vector float fy2l = vec_extract_fp32_from_shortl(y2);
-        vector float fy2h = vec_extract_fp32_from_shorth(y2);
-        vector float fy3l = vec_extract_fp32_from_shortl(y3);
-        vector float fy3h = vec_extract_fp32_from_shorth(y3);
-
-        sum0 = vec_madd(fx0l, fy0l, sum0);
-        sum1 = vec_madd(fx0h, fy0h, sum1);
-        sum2 = vec_madd(fx1l, fy1l, sum2);
-        sum3 = vec_madd(fx1h, fy1h, sum3);
-        sum4 = vec_madd(fx2l, fy2l, sum4);
-        sum5 = vec_madd(fx2h, fy2h, sum5);
-        sum6 = vec_madd(fx3l, fy3l, sum6);
-        sum7 = vec_madd(fx3h, fy3h, sum7);
-    }
-
-    sum0 = vec_add(sum0, sum1);
-    sum2 = vec_add(sum2, sum3);
-    sum4 = vec_add(sum4, sum5);
-    sum6 = vec_add(sum6, sum7);
-
-    sum0 = vec_add(sum0, sum2);
-    sum4 = vec_add(sum4, sum6);
-
-    sum0 = vec_add(sum0, sum4);
-
-    sumf = vec_extract(sum0, 0) + vec_extract(sum0, 1)
-         + vec_extract(sum0, 2) + vec_extract(sum0, 3);
-
-    for (int i = n32; i < n; ++i) {
-        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
-    }
 #else
     for (int i = 0; i < n; ++i) {
         sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);
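With GGML_SIMD now defined for POWER9, this hand-rolled dot product is subsumed by the generic loop in ggml_vec_dot_f16. For reference, that loop looks roughly like the sketch below (a paraphrase of the GGML_SIMD pattern in ggml.c, not this commit's literal text); with the POWER9 values, GGML_F16_ARR = GGML_F16_STEP/GGML_F16_EPR = 8, so each step covers the same 32 elements as the deleted code:

    // Inside ggml_vec_dot_f16(), #if defined(GGML_SIMD) branch (paraphrased).
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };
    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ax[j]  = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
            ay[j]  = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);
        }
    }

    GGML_F16_VEC_REDUCE(sumf, sum); // horizontal sum of all accumulators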
@@ -911,65 +851,6 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_
         GGML_ASSERT(false);
         y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
     }
-#elif defined(__POWER9_VECTOR__)
-    // TODO: this is temporary because I cannot fit it in the GGML_SIMD pattern like all other architectures without
-    //       being able to test it. hoping someone with access to a POWER9 machine can help out here.
-    const int n32 = (n & ~31);
-    for (int i = 0, j = 0; i < n32; i += 32, j += 64) {
-        // Use vec_xl, not vec_ld, because x is sometimes unaligned!
-        vector unsigned short x0 = vec_xl(j +  0, x);
-        vector unsigned short x1 = vec_xl(j + 16, x);
-        vector unsigned short x2 = vec_xl(j + 32, x);
-        vector unsigned short x3 = vec_xl(j + 48, x);
-
-        vector unsigned short y0 = vec_xl(j +  0, y);
-        vector unsigned short y1 = vec_xl(j + 16, y);
-        vector unsigned short y2 = vec_xl(j + 32, y);
-        vector unsigned short y3 = vec_xl(j + 48, y);
-
-        vector float v4 = vec_splats(v);
-
-        vector float fx0l = vec_extract_fp32_from_shortl(x0);
-        vector float fx0h = vec_extract_fp32_from_shorth(x0);
-        vector float fx1l = vec_extract_fp32_from_shortl(x1);
-        vector float fx1h = vec_extract_fp32_from_shorth(x1);
-        vector float fx2l = vec_extract_fp32_from_shortl(x2);
-        vector float fx2h = vec_extract_fp32_from_shorth(x2);
-        vector float fx3l = vec_extract_fp32_from_shortl(x3);
-        vector float fx3h = vec_extract_fp32_from_shorth(x3);
-
-        vector float fy0l = vec_extract_fp32_from_shortl(y0);
-        vector float fy0h = vec_extract_fp32_from_shorth(y0);
-        vector float fy1l = vec_extract_fp32_from_shortl(y1);
-        vector float fy1h = vec_extract_fp32_from_shorth(y1);
-        vector float fy2l = vec_extract_fp32_from_shortl(y2);
-        vector float fy2h = vec_extract_fp32_from_shorth(y2);
-        vector float fy3l = vec_extract_fp32_from_shortl(y3);
-        vector float fy3h = vec_extract_fp32_from_shorth(y3);
-
-        fy0l = vec_madd(fx0l, v4, fy0l);
-        fy0h = vec_madd(fx0h, v4, fy0h);
-        fy1l = vec_madd(fx1l, v4, fy1l);
-        fy1h = vec_madd(fx1h, v4, fy1h);
-        fy2l = vec_madd(fx2l, v4, fy2l);
-        fy2h = vec_madd(fx2h, v4, fy2h);
-        fy3l = vec_madd(fx3l, v4, fy3l);
-        fy3h = vec_madd(fx3h, v4, fy3h);
-
-        y0 = vec_pack_to_short_fp32(fy0h, fy0l);
-        y1 = vec_pack_to_short_fp32(fy1h, fy1l);
-        y2 = vec_pack_to_short_fp32(fy2h, fy2l);
-        y3 = vec_pack_to_short_fp32(fy3h, fy3l);
-
-        vec_xst(y0, j +  0, y);
-        vec_xst(y1, j + 16, y);
-        vec_xst(y2, j + 32, y);
-        vec_xst(y3, j + 48, y);
-    }
-
-    for (int i = n32; i < n; ++i) {
-        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
-    }
 #else
     for (int i = 0; i < n; ++i) {
         y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);
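Similarly, the generic GGML_SIMD branch of ggml_vec_mad_f16 now covers POWER9. A sketch of that pattern (again a paraphrase under the same assumptions) also shows why GGML_F16_VEC_STORE takes the whole register array plus the index: on POWER9 the store fires only on odd j, packing ay[j - 1] and ay[j] into one vec_xst:

    // Inside ggml_vec_mad_f16(), #if defined(GGML_SIMD) branch (paraphrased).
    const int np = (n & ~(GGML_F16_STEP - 1));

    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v); // broadcast the scalar multiplier

    GGML_F16_VEC ax[GGML_F16_ARR];
    GGML_F16_VEC ay[GGML_F16_ARR];

    for (int i = 0; i < np; i += GGML_F16_STEP) {
        for (int j = 0; j < GGML_F16_ARR; j++) {
            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);
            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx); // ay += ax*v
            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);
        }
    }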
 