ci : enable -Werror for CUDA builds (llama/5579)
* cmake : pass -Werror through -Xcompiler
ggml-ci
* make, cmake : enable CUDA errors on warnings
ggml-ci
ggml-cuda.cu  CHANGED  (+26 -24)
@@ -651,18 +651,18 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
     return a;
 }
 
-static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-    }
-    return a;
-#else
-    (void) a;
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
+//static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+//#pragma unroll
+//    for (int mask = 16; mask > 0; mask >>= 1) {
+//        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+//    }
+//    return a;
+//#else
+//    (void) a;
+//    NO_DEVICE_CODE;
+//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+//}
 
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
@@ -672,18 +672,18 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
     return x;
 }
 
-static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-#else
-    (void) x;
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
-}
+//static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+//#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+//#pragma unroll
+//    for (int mask = 16; mask > 0; mask >>= 1) {
+//        x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+//    }
+//    return x;
+//#else
+//    (void) x;
+//    NO_DEVICE_CODE;
+//#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
+//}
 
 static __device__ __forceinline__ float op_repeat(const float a, const float b) {
     return b;
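Both half2 overloads above implement the standard butterfly reduction: each XOR-shuffle step exchanges values between lanes whose ids differ in one bit, so after log2(32) = 5 steps every lane of the warp holds the full sum (or max). Below is a minimal compile-and-run sketch of the same pattern for plain float; the names warp_reduce_sum_f32 and reduce_demo are illustrative and not taken from ggml-cuda.cu.

    // Butterfly warp reduction, specialized to float (illustrative names only).
    #include <cstdio>

    static __device__ __forceinline__ float warp_reduce_sum_f32(float x) {
    #pragma unroll
        for (int mask = 16; mask > 0; mask >>= 1) {
            // exchange with the lane whose id differs in one bit, then accumulate
            x += __shfl_xor_sync(0xffffffff, x, mask, 32);
        }
        return x; // every lane now holds the warp-wide sum
    }

    __global__ void reduce_demo(float * out) {
        float v = (float) threadIdx.x;     // lane i contributes i
        v = warp_reduce_sum_f32(v);
        if (threadIdx.x == 0) {
            *out = v;                      // expected: 0 + 1 + ... + 31 = 496
        }
    }

    int main() {
        float * d_out = nullptr;
        cudaMalloc((void **) &d_out, sizeof(float));
        reduce_demo<<<1, 32>>>(d_out);
        float h_out = 0.0f;
        cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
        printf("warp sum = %f\n", h_out);  // 496.000000
        cudaFree(d_out);
        return 0;
    }

The half2 versions do the same thing two half values at a time via __hadd2 / __hmax2; they are commented out here rather than deleted, presumably because nothing in this file currently calls them and an unused static function would now trip the warnings-as-errors build.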
@@ -4641,10 +4641,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1(
     const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f;
     return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2);
 #else
+    (void) ksigns64;
     assert(false);
     return 0.f;
 #endif
 #else
+    (void) ksigns64;
     assert(false);
     return 0.f;
 #endif
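The two (void) ksigns64; additions follow the same idiom as the (void) a; and (void) x; casts above: in build configurations that compile only the fallback branch, ksigns64 is never referenced, and with warnings now promoted to errors that alone would break the build. The cast to void marks the variable as deliberately unused. Here is a compile-only sketch of the idiom, using made-up names (kTable, dot_stub, USE_FAST_PATH) that are not part of ggml-cuda.cu:

    // Intended to compile cleanly with and without -DUSE_FAST_PATH,
    // even when warnings are treated as errors.
    static __device__ const int kTable[4] = { 1, 2, 3, 4 };

    static __device__ __forceinline__ float dot_stub(const float * x) {
    #ifdef USE_FAST_PATH
        return x[0] * (float) kTable[0];   // the only path that actually reads kTable
    #else
        (void) kTable;   // silence "declared but never referenced" for this configuration
        (void) x;        // likewise for the otherwise-unused parameter
        return 0.f;
    #endif
    }

The cast changes nothing in the generated code; it is simply the portable way to tell the compiler that a name is intentionally unused in that branch.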