Spaces:
Running
Running
Konstantin Zhuravlyov
committed on
ggml : use __builtin_amdgcn_sudot4 in __dp4a for gfx11 (llama/4787)
Browse files - ggml-cuda.cu +1 -1
ggml-cuda.cu
CHANGED
|
@@ -183,7 +183,7 @@ static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
|
|
| 183 |
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
| 184 |
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
| 185 |
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
| 186 |
-
#elif defined(__gfx1100__)
|
| 187 |
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
| 188 |
#elif defined(__gfx1010__) || defined(__gfx900__)
|
| 189 |
int tmp1;
|
|
|
|
| 183 |
static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
| 184 |
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
|
| 185 |
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
| 186 |
+
#elif defined(RDNA3)
|
| 187 |
c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
|
| 188 |
#elif defined(__gfx1010__) || defined(__gfx900__)
|
| 189 |
int tmp1;
|