jeffbolznv committed
Commit ba60f98 · 1 Parent(s): 85e2387

vulkan : multithread pipeline creation (ggml/963)

Files changed (1)
  1. ggml/src/ggml-vulkan.cpp +37 -4
ggml/src/ggml-vulkan.cpp CHANGED
@@ -20,6 +20,8 @@
 #include <unordered_map>
 #include <memory>
 #include <mutex>
+#include <future>
+#include <thread>
 
 #include "ggml-impl.h"
 #include "ggml-backend-impl.h"
@@ -607,13 +609,16 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx
 
 GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
 
-static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, const std::string& name, size_t spv_size, const void* spv_data, const std::string& entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
+// variables to track number of compiles in progress
+static uint32_t compile_count = 0;
+static std::mutex compile_count_mutex;
+static std::condition_variable compile_count_cond;
+
+static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align) {
     VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
 
-    std::lock_guard<std::mutex> guard(device->mutex);
-
     pipeline = std::make_shared<vk_pipeline_struct>();
     pipeline->name = name;
     pipeline->parameter_count = parameter_count;
@@ -681,7 +686,17 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co
         pipeline->layout);
     pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
 
-    device->pipelines.insert({ pipeline->name, pipeline });
+    {
+        std::lock_guard<std::mutex> guard(device->mutex);
+        device->pipelines.insert({ pipeline->name, pipeline });
+    }
+
+    {
+        std::lock_guard<std::mutex> guard(compile_count_mutex);
+        assert(compile_count > 0);
+        compile_count--;
+    }
+    compile_count_cond.notify_all();
 }
 
 static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
@@ -1193,6 +1208,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
     device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
     device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
 
+    std::vector<std::future<void>> compiles;
+    auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
+        {
+            // wait until fewer than N compiles are in progress
+            uint32_t N = std::max(1u, std::thread::hardware_concurrency());
+            std::unique_lock<std::mutex> guard(compile_count_mutex);
+            while (compile_count >= N) {
+                compile_count_cond.wait(guard);
+            }
+            compile_count++;
+        }
+        compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align));
+    };
+
     if (device->fp16) {
         ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
         ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
@@ -1742,6 +1771,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
 
     ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
+
+    for (auto &c : compiles) {
+        c.wait();
+    }
 }
 
 static vk_device ggml_vk_get_device(size_t idx) {
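
In summary, ggml_vk_load_shaders now routes every ggml_vk_create_pipeline call through a local lambda that dispatches the real work (ggml_vk_create_pipeline_func) via std::async, caps the number of in-flight compiles at std::thread::hardware_concurrency() using compile_count guarded by compile_count_mutex and compile_count_cond, and waits on all returned futures before returning. Because the compile now runs on worker threads, the insert into device->pipelines is serialized with device->mutex inside the worker instead of holding the lock for the whole function.

For reference, below is a minimal standalone sketch of the same throttled-std::async pattern. The names (compile_one, in_flight, max_in_flight) are hypothetical and not part of the commit, and the sketch forces std::launch::async where the commit relies on the default launch policy.

// Standalone sketch of the throttled-std::async pattern (hypothetical names).
#include <algorithm>
#include <condition_variable>
#include <cstdint>
#include <cstdio>
#include <future>
#include <mutex>
#include <thread>
#include <vector>

static uint32_t                in_flight = 0;   // number of jobs currently running
static std::mutex              in_flight_mutex;
static std::condition_variable in_flight_cond;

// Stand-in for ggml_vk_create_pipeline_func: do the expensive work, then
// decrement the counter and wake any producer waiting for a free slot.
static void compile_one(int job) {
    std::printf("compiling job %d\n", job); // placeholder for the real pipeline compile
    {
        std::lock_guard<std::mutex> guard(in_flight_mutex);
        in_flight--;
    }
    in_flight_cond.notify_all();
}

int main() {
    const uint32_t max_in_flight = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::future<void>> compiles;

    for (int job = 0; job < 32; ++job) {
        {
            // Block until fewer than max_in_flight jobs are in progress.
            std::unique_lock<std::mutex> guard(in_flight_mutex);
            in_flight_cond.wait(guard, [&] { return in_flight < max_in_flight; });
            in_flight++;
        }
        // std::launch::async forces a worker thread; the commit uses the default policy.
        compiles.push_back(std::async(std::launch::async, compile_one, job));
    }

    // Same join step as the end of ggml_vk_load_shaders: wait for every future.
    for (auto & c : compiles) {
        c.wait();
    }
    return 0;
}

The counter plus condition variable is effectively a hand-rolled counting semaphore; std::counting_semaphore only exists from C++20 onward, so the mutex/condition_variable pair keeps the throttle usable with earlier language standards.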