From ce6fee43eea7743a5ea259625988bfb6ea2b0e9d Mon Sep 17 00:00:00 2001
From: Michael Yang
Date: Fri, 11 Oct 2024 17:01:22 -0700
Subject: [PATCH] fix cuda unpad kernel

---
 llama/ggml-cuda/pad.cu       | 4 ++--
 llm/patches/9999-unpad.patch | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llama/ggml-cuda/pad.cu b/llama/ggml-cuda/pad.cu
index d04d1b18..3a753ea0 100644
--- a/llama/ggml-cuda/pad.cu
+++ b/llama/ggml-cuda/pad.cu
@@ -100,9 +100,9 @@ static __global__ void unpad_f32(const float * x, float * dst, const int ne0, co
 static void unpad_f32_cuda(const float * x, float * dst,
     const int ne00, const int ne01, const int ne02, const int ne03,
     const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
-    int num_blocks = (ne0 + CUDA_unpad_BLOCK_SIZE - 1) / CUDA_unpad_BLOCK_SIZE;
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
     dim3 gridDim(num_blocks, ne1, ne2*ne3);
-    unpad_f32<<<gridDim, CUDA_unpad_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
+    unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
 }
 
 void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
diff --git a/llm/patches/9999-unpad.patch b/llm/patches/9999-unpad.patch
index 04ec919c..e9c3abdd 100644
--- a/llm/patches/9999-unpad.patch
+++ b/llm/patches/9999-unpad.patch
@@ -1,4 +1,4 @@
-From eb77f6e32425e9f18914f3dec66faf4129c30301 Mon Sep 17 00:00:00 2001
+From a749b1d9635ee48b25dba95a12fe032abf427ddc Mon Sep 17 00:00:00 2001
 From: Michael Yang
 Date: Fri, 11 Oct 2024 16:19:43 -0700
 Subject: [PATCH] add unpad operator
@@ -64,7 +64,7 @@ index 8a844b02..7e4611fb 100644
  case GGML_OP_TIMESTEP_EMBEDDING:
  case GGML_OP_LEAKY_RELU:
 diff --git a/ggml/src/ggml-cuda/pad.cu b/ggml/src/ggml-cuda/pad.cu
-index aba539e8..3d4c4ca4 100644
+index aba539e8..39fd4b16 100644
 --- a/ggml/src/ggml-cuda/pad.cu
 +++ b/ggml/src/ggml-cuda/pad.cu
@@ -98,9 +98,9 @@ index aba539e8..3d4c4ca4 100644
 --- a/ggml/src/ggml-cuda/pad.cu
 +++ b/ggml/src/ggml-cuda/pad.cu
 +static void unpad_f32_cuda(const float * x, float * dst,
 +    const int ne00, const int ne01, const int ne02, const int ne03,
 +    const int ne0, const int ne1, const int ne2, const int ne3, cudaStream_t stream) {
-+    int num_blocks = (ne0 + CUDA_unpad_BLOCK_SIZE - 1) / CUDA_unpad_BLOCK_SIZE;
++    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
 +    dim3 gridDim(num_blocks, ne1, ne2*ne3);
-+    unpad_f32<<<gridDim, CUDA_unpad_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
++    unpad_f32<<<gridDim, CUDA_PAD_BLOCK_SIZE, 0, stream>>>(x, dst, ne0, ne00, ne01, ne02, ne03);
 +}
 +
 +void ggml_cuda_op_unpad(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {