Spaces:
Running
Running
Commit
·
ba60f98
1
Parent(s):
85e2387
vulkan : multithread pipeline creation (ggml/963)
Browse files- ggml/src/ggml-vulkan.cpp +37 -4
ggml/src/ggml-vulkan.cpp
CHANGED
|
@@ -20,6 +20,8 @@
|
|
| 20 |
#include <unordered_map>
|
| 21 |
#include <memory>
|
| 22 |
#include <mutex>
|
|
|
|
|
|
|
| 23 |
|
| 24 |
#include "ggml-impl.h"
|
| 25 |
#include "ggml-backend-impl.h"
|
|
@@ -607,13 +609,16 @@ typedef void (*ggml_vk_func_t)(ggml_backend_vk_context * ctx, vk_context& subctx
|
|
| 607 |
|
| 608 |
GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
|
| 609 |
|
| 610 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
|
| 612 |
GGML_ASSERT(parameter_count > 0);
|
| 613 |
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
| 614 |
|
| 615 |
-
std::lock_guard<std::mutex> guard(device->mutex);
|
| 616 |
-
|
| 617 |
pipeline = std::make_shared<vk_pipeline_struct>();
|
| 618 |
pipeline->name = name;
|
| 619 |
pipeline->parameter_count = parameter_count;
|
|
@@ -681,7 +686,17 @@ static void ggml_vk_create_pipeline(vk_device& device, vk_pipeline& pipeline, co
|
|
| 681 |
pipeline->layout);
|
| 682 |
pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
|
| 683 |
|
| 684 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 685 |
}
|
| 686 |
|
| 687 |
static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
|
|
@@ -1193,6 +1208,20 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1193 |
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1194 |
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1195 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1196 |
if (device->fp16) {
|
| 1197 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
| 1198 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
|
@@ -1742,6 +1771,10 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
|
| 1742 |
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
| 1743 |
|
| 1744 |
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1745 |
}
|
| 1746 |
|
| 1747 |
static vk_device ggml_vk_get_device(size_t idx) {
|
|
|
|
| 20 |
#include <unordered_map>
|
| 21 |
#include <memory>
|
| 22 |
#include <mutex>
|
| 23 |
+
#include <future>
|
| 24 |
+
#include <thread>
|
| 25 |
|
| 26 |
#include "ggml-impl.h"
|
| 27 |
#include "ggml-backend-impl.h"
|
|
|
|
| 609 |
|
| 610 |
GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend);
|
| 611 |
|
| 612 |
+
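// Shared state tracking how many pipeline compiles are in flight: compile_count is guarded by compile_count_mutex, and compile_count_cond is notified each time a compile finishes.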
|
| 613 |
+
static uint32_t compile_count = 0;
|
| 614 |
+
static std::mutex compile_count_mutex;
|
| 615 |
+
static std::condition_variable compile_count_cond;
|
| 616 |
+
|
| 617 |
+
static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipeline, const std::string name, size_t spv_size, const void* spv_data, const std::string entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t> specialization_constants, uint32_t align) {
|
| 618 |
VK_LOG_DEBUG("ggml_vk_create_pipeline(" << device->name << ", " << name << ", " << entrypoint << ", " << parameter_count << ", " << push_constant_size << ", (" << wg_denoms[0] << "," << wg_denoms[1] << "," << wg_denoms[2] << "), specialization_constants, " << align << ")");
|
| 619 |
GGML_ASSERT(parameter_count > 0);
|
| 620 |
GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
|
| 621 |
|
|
|
|
|
|
|
| 622 |
pipeline = std::make_shared<vk_pipeline_struct>();
|
| 623 |
pipeline->name = name;
|
| 624 |
pipeline->parameter_count = parameter_count;
|
|
|
|
| 686 |
pipeline->layout);
|
| 687 |
pipeline->pipeline = device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
|
| 688 |
|
| 689 |
+
{
|
| 690 |
+
std::lock_guard<std::mutex> guard(device->mutex);
|
| 691 |
+
device->pipelines.insert({ pipeline->name, pipeline });
|
| 692 |
+
}
|
| 693 |
+
|
| 694 |
+
{
|
| 695 |
+
std::lock_guard<std::mutex> guard(compile_count_mutex);
|
| 696 |
+
assert(compile_count > 0);
|
| 697 |
+
compile_count--;
|
| 698 |
+
}
|
| 699 |
+
compile_count_cond.notify_all();
|
| 700 |
}
|
| 701 |
|
| 702 |
static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) {
|
|
|
|
| 1208 |
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_Q6_K] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1209 |
device->pipeline_dequant_mul_mat_mat_id[GGML_TYPE_IQ4_NL] = std::make_shared<vk_matmul_pipeline_struct>();
|
| 1210 |
|
| 1211 |
+
std::vector<std::future<void>> compiles;
|
| 1212 |
+
auto const &ggml_vk_create_pipeline = [&](vk_device& device, vk_pipeline& pipeline, const std::string &name, size_t spv_size, const void* spv_data, const std::string &entrypoint, uint32_t parameter_count, uint32_t push_constant_size, std::array<uint32_t, 3> wg_denoms, std::vector<uint32_t>&& specialization_constants, uint32_t align) {
|
| 1213 |
+
{
|
| 1214 |
+
// wait until fewer than N compiles are in progress
|
| 1215 |
+
uint32_t N = std::max(1u, std::thread::hardware_concurrency());
|
| 1216 |
+
std::unique_lock<std::mutex> guard(compile_count_mutex);
|
| 1217 |
+
while (compile_count >= N) {
|
| 1218 |
+
compile_count_cond.wait(guard);
|
| 1219 |
+
}
|
| 1220 |
+
compile_count++;
|
| 1221 |
+
}
|
| 1222 |
+
compiles.push_back(std::async(ggml_vk_create_pipeline_func, std::ref(device), std::ref(pipeline), name, spv_size, spv_data, entrypoint, parameter_count, push_constant_size, wg_denoms, specialization_constants, align));
|
| 1223 |
+
};
|
| 1224 |
+
|
| 1225 |
if (device->fp16) {
|
| 1226 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->l, "matmul_f32_l", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), l_wg_denoms, warptile_l, 1);
|
| 1227 |
ggml_vk_create_pipeline(device, device->pipeline_matmul_f32->m, "matmul_f32_m", matmul_f32_f32_len, matmul_f32_f32_data, "main", 3, sizeof(vk_mat_mat_push_constants), m_wg_denoms, warptile_m, 1);
|
|
|
|
| 1771 |
ggml_vk_create_pipeline(device, device->pipeline_im2col_f32_f16, "im2col_f32_f16", im2col_f32_f16_len, im2col_f32_f16_data, "main", 2, sizeof(vk_op_im2col_push_constants), {256, 1, 1}, {}, 1);
|
| 1772 |
|
| 1773 |
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
| 1774 |
+
|
| 1775 |
+
for (auto &c : compiles) {
|
| 1776 |
+
c.wait();
|
| 1777 |
+
}
|
| 1778 |
}
|
| 1779 |
|
| 1780 |
static vk_device ggml_vk_get_device(size_t idx) {
|