Chenguang Li committed
Commit: f013e2d
Parent(s): 7681e32
CANN: Support MOE Model MUL_MAT_ID (llama/13042)
Signed-off-by: noemotiovon <[email protected]>
ggml/src/ggml-cann/aclnn_ops.cpp
CHANGED
@@ -65,6 +65,7 @@
 #include <aclnnop/aclnn_eq_tensor.h>
 #include <aclnnop/aclnn_gt_scalar.h>
 #include <aclnnop/aclnn_pow.h>
+#include <aclnnop/aclnn_grouped_matmul_v2.h>
 #include <float.h>

 #include <cmath>

@@ -2587,3 +2588,149 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
     ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
 }
+
+/**
+ * @brief Performs expert-specific matrix multiplication (MoE) with
+ * floating-point precision using the CANN backend.
+ *
+ * This function executes a matrix multiplication operation tailored for
+ * Mixture of Experts (MoE) models, where the input tensor is multiplied
+ * with expert-specific weight matrices. It uses the CANN backend for
+ * efficient computation and stores the result in the destination tensor `dst`.
+ * The operation may leverage identity-based optimizations or routing masks
+ * as part of sparse expert selection.
+ *
+ * @param ctx The context for executing CANN backend operations.
+ * @param dst The destination tensor where the MoE multiplication result
+ *            will be stored.
+ *
+ * @note This function assumes floating-point data types and is designed for
+ * MoE architectures, possibly involving sparse expert routing.
+ */
+static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    // dst [M, K, N, 1]
+    ggml_tensor * src0 = dst->src[0];  // src0 [D, M, A, 1]
+    ggml_tensor * src1 = dst->src[1];  // src1 [D, B, N, 1], B = K or B = 1
+    ggml_tensor * ids  = dst->src[2];  // ids  [K, N]
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // copy index from npu to cpu
+    int64_t n_as  = ne02;       // A
+    int64_t n_ids = ids->ne[0]; // K
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+    ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
+                           ACL_MEMCPY_DEVICE_TO_HOST);
+    ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
+
+    char * src0_original = (char *) src0->data;
+    char * src1_original = (char *) src1->data;
+    char * dst_original  = (char *) dst->data;
+    size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
+
+    // src0 is F16, src1 is F32, dst is F32
+    ggml_cann_pool_alloc src0_cast_allocator;
+    if (src0->type == GGML_TYPE_F16) {
+        src0_cast_allocator.alloc(ctx.pool(), sizeof(float) * ggml_nelements(src0));
+        void* src0_cast_buf = src0_cast_allocator.get();
+
+        size_t cast_nb[GGML_MAX_DIMS];
+        cast_nb[0] = sizeof(float_t);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            cast_nb[i] = cast_nb[i - 1] * src0->ne[i - 1];
+        }
+
+        aclTensor* acl_src0_f16 = ggml_cann_create_tensor(src0);
+        aclTensor* acl_cast = ggml_cann_create_tensor(src0_cast_buf,
+            ACL_FLOAT, sizeof(float), src0->ne, cast_nb, 4);
+        GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src0_f16, ACL_FLOAT, acl_cast);
+        ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
+
+        src0_original = (char *) src0_cast_buf;
+        memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
+    }
+
+    std::vector<aclTensor*> src0_tensor_vec;
+    std::vector<aclTensor*> src1_tensor_vec;
+    std::vector<aclTensor*> dst_tensor_vec;
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            // src0_row [M, D] -> weight && permute
+            int64_t src0_ne[2] = {ne01, ne00};
+            size_t  src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
+            // src1_row [D, 1] -> input
+            int64_t src1_ne[2] = {ne10, 1};
+            size_t  src1_nb[2] = {nb10, nb11};
+            // dst_row [M, 1] -> out
+            int64_t dst_ne[2] = {ne0, 1};
+            size_t  dst_nb[2] = {nb0, nb1};
+
+            // expert index
+            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+            // If B = 1 (broadcast), always use 0; otherwise, use id.
+            int64_t i11 = (ne11 == 1 ? 0 : id);
+            int64_t i12 = iid1;
+
+            int64_t i1 = id;
+            int64_t i2 = i12;
+
+            void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
+            void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
+            void* dst_tmp_ptr  = dst_original  + i1*nb1   + i2*nb2;
+
+            aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
+                ACL_FLOAT, sizeof(float), src0_ne, src0_nb, 2);
+            aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
+                ACL_FLOAT, sizeof(float), src1_ne, src1_nb, 2);
+            aclTensor* acl_dst  = ggml_cann_create_tensor(dst_tmp_ptr,
+                ACL_FLOAT, sizeof(float), dst_ne, dst_nb, 2);
+
+            src0_tensor_vec.push_back(acl_src0);
+            src1_tensor_vec.push_back(acl_src1);
+            dst_tensor_vec.push_back(acl_dst);
+        }
+    }
+
+    // GroupedMatmulV2 requires tensor_list.size < 128
+    size_t GROUP_SIZE = 128;
+    std::vector<std::vector<aclTensor*>> src0_tensor_vec_vec;
+    std::vector<std::vector<aclTensor*>> src1_tensor_vec_vec;
+    std::vector<std::vector<aclTensor*>> dst_tensor_vec_vec;
+
+    // split and call GroupedMatmulV2
+    for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
+        size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
+        std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
+        std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
+        std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
+
+        aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
+        aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
+        aclTensorList* dst_tensor_list  = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
+
+        GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
+            nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
+
+        ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
+    }
+    return;
+}
+
+void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    const enum ggml_type type = dst->src[0]->type;
+    switch (type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_F16:
+            ggml_cann_mul_mat_id_fp(ctx, dst);
+            break;
+        default:
+            GGML_ABORT("Unsupported type for mul_mat_id");
+            break;
+    }
+}
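As context for the kernel above: the routing and layout conventions it implements can be captured in a few lines of plain C++. Below is a hypothetical, self-contained sketch (not part of the commit) of a naive CPU reference for the MUL_MAT_ID semantics, using the same src0 [D, M, A], src1 [D, B, N], ids [K, N], dst [M, K, N] shapes with dimension 0 contiguous, as in ggml. The helper name mul_mat_id_ref is invented for illustration.

    // Hypothetical reference sketch: dst[m, k, n] = sum_d src0[d, m, ids[k, n]] * src1[d, b, n]
    // with b = 0 when B == 1 (broadcast input), else b = k.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static void mul_mat_id_ref(
            const std::vector<float>   & src0, int64_t D, int64_t M, int64_t A, // weights [D, M, A]
            const std::vector<float>   & src1, int64_t B, int64_t N,            // inputs  [D, B, N]
            const std::vector<int32_t> & ids,  int64_t K,                       // routing [K, N]
            std::vector<float>         & dst) {                                 // output  [M, K, N]
        dst.assign(M * K * N, 0.0f);
        for (int64_t n = 0; n < N; n++) {
            for (int64_t k = 0; k < K; k++) {
                const int32_t e = ids[n * K + k];   // expert index for this (token, slot)
                assert(e >= 0 && e < A);
                const int64_t b = (B == 1 ? 0 : k); // broadcast a single input row if B == 1
                for (int64_t m = 0; m < M; m++) {
                    float acc = 0.0f;
                    for (int64_t d = 0; d < D; d++) {
                        acc += src0[(e * M + m) * D + d] * src1[(n * B + b) * D + d];
                    }
                    dst[(n * K + k) * M + m] = acc;
                }
            }
        }
    }

    int main() {
        // toy sizes: D=2, M=2, A=2 experts, B=1 (broadcast), N=1 token, K=2 slots
        std::vector<float>   src0 = {1, 0, 0, 1,  2, 0, 0, 2}; // expert 0 = I, expert 1 = 2*I
        std::vector<float>   src1 = {3, 4};                    // one input row
        std::vector<int32_t> ids  = {0, 1};                    // route the token to experts 0 and 1
        std::vector<float>   dst;
        mul_mat_id_ref(src0, 2, 2, 2, src1, 1, 1, ids, 2, dst);
        for (float v : dst) std::printf("%g ", v);             // expect: 3 4 6 8
        std::printf("\n");
        return 0;
    }

Compiled with any C++11 compiler, the toy case routes one token to experts 0 and 1 and prints 3 4 6 8, matching the per-(token, expert) rows the CANN kernel builds before handing them to GroupedMatmulV2.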
ggml/src/ggml-cann/aclnn_ops.h
CHANGED
@@ -978,6 +978,33 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
     }
 }

+/**
+ * @brief Performs sparse expert-based matrix multiplication using the CANN backend.
+ *
+ * @details This function implements a MoE-style batched matrix multiplication, where each input token
+ * is routed to one or more experts, and each expert corresponds to a specific [D, M] weight matrix
+ * in the source tensor `src0`. The routing indices are provided via the `ids` tensor.
+ *
+ * For each token (from `src1`), the function selects the corresponding expert(s) as specified by `ids`,
+ * performs the matrix multiplication with the selected expert's weight submatrix (from `src0`),
+ * and stores the results in `dst`. This operation is optimized and executed on the CANN backend.
+ *
+ * Dimensions:
+ *   - src0: [D, M, A, 1], where A is the number of experts
+ *   - src1: [D, B, N, 1], where N is the batch size and B is the slot count per sample
+ *   - ids : [K, N], where K is the number of experts each token is routed to
+ *   - dst : [M, K, N, 1], output tensor storing the result of the expert × token multiplication
+ *
+ * The function handles two main modes:
+ *   - If `ne12 == 1`, a simpler per-token loop is used.
+ *   - TODO: If `ne12 > 1`, grouped multiplication and memory copying are used for efficiency.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the expert-weighted token outputs are stored.
+ *            Expected to be of shape [M, K, N, 1].
+ */
+void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /**
  * @brief Applies a element-wise operation to two input tensors using the CANN
  * backend.
ggml/src/ggml-cann/ggml-cann.cpp
CHANGED
@@ -1672,7 +1672,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_mul_mat(ctx, dst);
             break;
         case GGML_OP_MUL_MAT_ID:
-            return false;
+            ggml_cann_mul_mat_id(ctx, dst);
+            break;
         case GGML_OP_SCALE:
             ggml_cann_scale(ctx, dst);
             break;

@@ -2030,7 +2031,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             }
         }
         case GGML_OP_MUL_MAT_ID:
-            return false;
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F16:
+                case GGML_TYPE_F32:
+                    return true;
+                default:
+                    return false;
+            }
         // embedding
         case GGML_OP_GET_ROWS: {
             switch (op->src[0]->type) {