Mirror of https://github.com/tcsenpai/ollama.git (synced 2025-06-08 04:05:20 +00:00)
llm: add server entrypoint for mllama
This commit is contained in:
parent 8ac915f709
commit d0c8ce5ea4

llm/ext_server/server.cpp (vendored): 12 changes
@@ -1032,6 +1032,18 @@ struct llama_server_context
     bool process_images(server_slot &slot) const
     {
+        // Set cross attention state for mllama models
+        // TODO (jmorganca): this should be provided via the API
+        // TODO (jmorganca): generalize this beyond mllama models
+        char arch_str[256];
+        llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
+        if (strcmp(arch_str, "mllama") == 0) {
+            // TODO (jmorganca): this should be passed in via the llama_decode api
+            // or similar, maybe using the llama_batch struct
+            // llama_reset_cross_attn_state(ctx);
+            // llama_set_cross_attn_state(ctx, (float*)cross_attn_state);
+        }
+
         for (slot_image &img : slot.images)
         {
             if (!img.request_encode_image)
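The new block keys off the GGUF metadata key general.architecture via llama_model_meta_val_str. A minimal standalone sketch of the same check, using a hypothetical helper name that is not part of this commit:

    // Sketch only: wraps the same metadata lookup used in the hunk above.
    // llama_model_meta_val_str returns a negative value when the key is absent.
    static bool is_mllama_model(const struct llama_model * model) {
        char arch_str[256] = {0};
        if (llama_model_meta_val_str(model, "general.architecture", arch_str, sizeof(arch_str)) < 0) {
            return false;
        }
        return strcmp(arch_str, "mllama") == 0;  // needs <cstring>
    }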
The remaining hunks update the vendored "add mllama support" patch:

@@ -1,4 +1,4 @@
-From c2db1ad0fc86de189959b628021a970511e9c6f9 Mon Sep 17 00:00:00 2001
+From 9935fbbf26ad4d9ca7735ec6ba4c0a206c0c8329 Mon Sep 17 00:00:00 2001
 From: jmorganca <jmorganca@gmail.com>
 Date: Tue, 24 Sep 2024 11:53:40 -0700
 Subject: [PATCH] add mllama support
@@ -13,8 +13,8 @@ kv cache once per run
 remaining is to implement the cross attention mask
 ---
  include/llama.h | 5 +
- src/llama.cpp   | 514 ++++++++++++++++++++++++++++++++++++++++++++++--
- 2 files changed, 499 insertions(+), 20 deletions(-)
+ src/llama.cpp   | 470 ++++++++++++++++++++++++++++++++++++++++++++++--
+ 2 files changed, 461 insertions(+), 14 deletions(-)
 
 diff --git a/include/llama.h b/include/llama.h
 index bfc37e88..94ce82a4 100644
@@ -33,7 +33,7 @@ index bfc37e88..94ce82a4 100644
  LLAMA_API void llama_free(struct llama_context * ctx);
 
 diff --git a/src/llama.cpp b/src/llama.cpp
-index b7771f53..75bbc226 100644
+index b7771f53..72a57a38 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
 @@ -170,6 +170,7 @@ static std::string format(const char * fmt, ...) {
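The include/llama.h side of the inner patch (5 added lines per the diffstat above) is not visible in these hunks. Judging only from the calls commented out in server.cpp earlier, the new entry points plausibly have roughly this shape; the return types and exact signatures are assumptions, not text from the patch:

    // Assumed shape, inferred from the commented-out calls in process_images():
    // llama_reset_cross_attn_state(ctx) and llama_set_cross_attn_state(ctx, (float*)state).
    LLAMA_API void llama_reset_cross_attn_state(struct llama_context * ctx);
    LLAMA_API void llama_set_cross_attn_state(struct llama_context * ctx, float * cross_attn_state);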
@@ -193,25 +193,6 @@ index b7771f53..75bbc226 100644
  };
 
  // very similar to llama_batch,
-@@ -2684,12 +2749,12 @@ struct llama_ubatch {
-     uint32_t n_seq_tokens; // tokens per sequence
-     uint32_t n_seqs;
-
--    llama_token * token; // [n_tokens]
--    float * embd; // [n_embd, n_tokens]
--    llama_pos * pos; // [n_tokens]
--    int32_t * n_seq_id; // [n_seqs]
--    llama_seq_id ** seq_id; // [n_seqs]
--    int8_t * output; // [n_tokens]
-+    llama_token * token; // [n_tokens]
-+    float * embd; // [n_embd, n_tokens]
-+    llama_pos * pos; // [n_tokens]
-+    int32_t * n_seq_id; // [n_seqs]
-+    llama_seq_id ** seq_id; // [n_seqs]
-+    int8_t * output; // [n_tokens]
- };
-
- struct llama_kv_cell {
 @@ -3268,6 +3333,10 @@ struct llama_context {
      // host buffer for the model output (logits and embeddings)
      ggml_backend_buffer_t buf_output = nullptr;
@@ -404,48 +385,7 @@ index b7771f53..75bbc226 100644
 
  // note: storing RoPE-ed version of K in the KV cache
  ggml_build_forward_expand(graph, ggml_cpy(ctx, k_cur, k_cache_view));
-@@ -9625,6 +9788,40 @@ static struct ggml_tensor * llm_build_rwkv6_channel_mix(
-     return ggml_mul(ctx, r, llm_build_lora_mm(lctx, ctx, layer->channel_mix_value, k));
- }
-
-+
-+static void show_tensor(std::string name, ggml_tensor *t) {
-+    LLAMA_LOG_INFO("%s [%lld, %lld]\n", name.c_str(), t->ne[0], t->ne[1]);
-+
-+    int cols = int(t->ne[0]);
-+    int rows = int(t->ne[1]);
-+
-+    for(int r=0; r<3; r++) {
-+        for(int c=0; c<3; c++) {
-+            float v = ggml_get_f32_nd(t, c, r, 0, 0);
-+            LLAMA_LOG_INFO("%11.8f ", v);
-+        }
-+        LLAMA_LOG_INFO("... ");
-+        for(int c=0; c<3; c++) {
-+            float v = ggml_get_f32_nd(t, cols-3+c, r, 0, 0);
-+            LLAMA_LOG_INFO("%11.8f ", v);
-+        }
-+        LLAMA_LOG_INFO("\n");
-+    }
-+    LLAMA_LOG_INFO(" ...\n");
-+    for(int r=0; r<3; r++) {
-+        for(int c=0; c<3; c++) {
-+            float v = ggml_get_f32_nd(t, c, rows-3+r, 0, 0);
-+            LLAMA_LOG_INFO("%11.8f ", v);
-+        }
-+        LLAMA_LOG_INFO("... ");
-+        for(int c=0; c<3; c++) {
-+            float v = ggml_get_f32_nd(t, cols-3+c, rows-3+r, 0, 0);
-+            LLAMA_LOG_INFO("%11.8f ", v);
-+        }
-+        LLAMA_LOG_INFO("\n");
-+    }
-+}
-+
- struct llm_build_context {
-     const llama_model & model;
-     llama_context & lctx;
-@@ -9743,6 +9940,7 @@ struct llm_build_context {
+@@ -9743,6 +9906,7 @@ struct llm_build_context {
      lctx.inp_pos_bucket = nullptr;
      lctx.inp_embd_enc = nullptr;
      lctx.inp_KQ_mask_cross = nullptr;
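For orientation, the show_tensor block dropped above was a debugging aid: it logs a tensor's first two dimensions and then prints the four 3x3 corner sub-blocks of a 2-D f32 tensor via ggml_get_f32_nd. A hypothetical call site, present in neither revision of the patch, might have looked like:

    // Hypothetical usage, assuming `cur` is a host-readable f32 tensor in the
    // graph-building code; not part of either version of the patch.
    show_tensor("result_norm", cur);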
@@ -453,7 +393,7 @@ index b7771f53..75bbc226 100644
      }
 
      void free() {
-@@ -10158,6 +10356,253 @@ struct llm_build_context {
+@@ -10158,6 +10322,253 @@ struct llm_build_context {
      LLM_NORM_RMS, cb, -1);
      cb(cur, "result_norm", -1);
 
@@ -707,7 +647,7 @@ index b7771f53..75bbc226 100644
      // lm_head
      cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
      cb(cur, "result_output", -1);
-@@ -15493,6 +15938,10 @@ static struct ggml_cgraph * llama_build_graph(
+@@ -15493,6 +15904,10 @@ static struct ggml_cgraph * llama_build_graph(
          {
              result = llm.build_llama();
          } break;
@@ -718,7 +658,7 @@ index b7771f53..75bbc226 100644
      case LLM_ARCH_BAICHUAN:
          {
              result = llm.build_baichuan();
-@@ -15736,7 +16185,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
+@@ -15736,7 +16151,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
 
      if (batch.token) {
          const int64_t n_tokens = batch.n_tokens;
@@ -726,7 +666,7 @@ index b7771f53..75bbc226 100644
          ggml_backend_tensor_set(lctx.inp_tokens, batch.token, 0, n_tokens*ggml_element_size(lctx.inp_tokens));
      }
 
-@@ -16123,6 +16571,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
+@@ -16123,6 +16537,15 @@ static void llama_set_inputs(llama_context & lctx, const llama_ubatch & batch) {
          }
      }
  }
@@ -734,13 +674,15 @@ index b7771f53..75bbc226 100644
 +    // TODO (jmorganca): this might copy a lot of data on every request of a
 +    // single generation even though it doesn't change, so we should
 +    // find a way to not set this more than one time per image
-+    if (lctx.cross_attn_state && lctx.inp_cross_attn_state->buffer) {
++    if (lctx.cross_attn_state &&
++        lctx.inp_cross_attn_state &&
++        lctx.inp_cross_attn_state->buffer) {
 +        ggml_backend_tensor_set(lctx.inp_cross_attn_state, lctx.cross_attn_state, 0, hparams.n_embd * 1601 * 4 * ggml_element_size(lctx.inp_cross_attn_state));
 +    }
  }
 
  // Make sure enough space is available for outputs.
-@@ -16430,6 +16885,10 @@ static int llama_decode_internal(
+@@ -16430,6 +16853,10 @@ static int llama_decode_internal(
 
      llama_set_inputs(lctx, ubatch);
 
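To put that TODO in numbers: each call copies n_embd * 1601 * 4 elements of cross-attention state into inp_cross_attn_state. A back-of-the-envelope sketch, where the n_embd value and the f32 element size are illustrative assumptions rather than values taken from the patch:

    // Illustrative only: assumes n_embd = 4096 and 4-byte f32 elements.
    size_t n_elems = 4096ull * 1601 * 4;       // 26,230,784 elements
    size_t n_bytes = n_elems * sizeof(float);  // ~105 MB copied per llama_set_inputs call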
@@ -751,7 +693,7 @@ index b7771f53..75bbc226 100644
      llama_graph_compute(lctx, gf, n_threads, threadpool);
 
      // update the kv ring buffer
-@@ -17586,7 +18045,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
+@@ -17586,7 +18013,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
      if (llama_model_has_encoder(&model)) {
          n_attn_layer *= 3;
      }
@@ -762,7 +704,7 @@ index b7771f53..75bbc226 100644
      }
 
      size_t total_size_org = 0;
-@@ -18681,6 +19142,18 @@ struct llama_context * llama_new_context_with_model(
+@@ -18681,6 +19110,18 @@ struct llama_context * llama_new_context_with_model(
      return ctx;
  }
 
@@ -781,7 +723,7 @@ index b7771f53..75bbc226 100644
      void llama_free(struct llama_context * ctx) {
          delete ctx;
      }
-@@ -18731,6 +19204,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+@@ -18731,6 +19172,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
 
      // use what we call a normal RoPE, operating on pairs of consecutive head values
      case LLM_ARCH_LLAMA: