Erik Scholz committed on
Commit
4ec988a
·
1 Parent(s): d6b6852

CUDA: compress mode option and default to size (llama/12029)

Browse files

CUDA 12.8 added an option (`-compress-mode`) to specify stronger compression for binaries, so we now default to "size".

ggml/CMakeLists.txt CHANGED
@@ -155,6 +155,9 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
155
  option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
156
  option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
157
  option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
 
 
 
158
 
159
  option(GGML_HIP "ggml: use HIP" OFF)
160
  option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
 
155
  option(GGML_CUDA_FA "ggml: compile ggml FlashAttention CUDA kernels" ON)
156
  option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
157
  option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
158
+ set (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
159
+ "ggml: cuda link binary compression mode; requires cuda 12.8+")
160
+ set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")
161
 
162
  option(GGML_HIP "ggml: use HIP" OFF)
163
  option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
ggml/src/ggml-cuda/CMakeLists.txt CHANGED
@@ -102,6 +102,15 @@ if (CUDAToolkit_FOUND)
102
 
103
  set(CUDA_FLAGS -use_fast_math)
104
 
 
 
 
 
 
 
 
 
 
105
  if (GGML_FATAL_WARNINGS)
106
  list(APPEND CUDA_FLAGS -Werror all-warnings)
107
  endif()
 
102
 
103
  set(CUDA_FLAGS -use_fast_math)
104
 
105
+ if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
106
+ # Options are:
107
+ # - none (not recommended)
108
+ # - speed (nvcc's default)
109
+ # - balance
110
+ # - size
111
+ list(APPEND CUDA_FLAGS -compress-mode=${GGML_CUDA_COMPRESSION_MODE})
112
+ endif()
113
+
114
  if (GGML_FATAL_WARNINGS)
115
  list(APPEND CUDA_FLAGS -Werror all-warnings)
116
  endif()