server.cpp: clean up cross-attention state

2025-07-23 17:50:11 +00:00 · 2024-09-26 23:53:12 -07:00 · 2024-09-26 23:53:12 -07:00 · 71e76f8c90
commit 71e76f8c90
parent 7d5e0ff80e
1 changed file with 7 additions and 5 deletions
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -729,6 +729,10 @@ struct llama_server_context
            slot->sparams.samplers_sequence = default_sparams.samplers_sequence;
        }
        // Check for mllama architecture, which processes images differently than llava
        char arch_str[256];
        llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
        bool is_mllama = strcmp(arch_str, "mllama") == 0;
        if (multimodal)
        {
            const auto &images_data = data.find("image_data");
@@ -738,11 +742,6 @@ struct llama_server_context
                {
                    const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
                    // Check for mllama architecture, which processes images differently than llava
                    char arch_str[256];
                    llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
                    bool is_mllama = strcmp(arch_str, "mllama") == 0;
                    if (is_mllama) {
                        LOG_INFO("MLLAMA architecture detected, processing first image", {{"slot_id", slot->id}});
@@ -820,6 +819,8 @@ struct llama_server_context
                    slot->params.input_suffix = prompt.substr(begin_prefix);
                    slot->params.cache_prompt = false; // multimodal doesn't support cache prompt
                }
            } else {
                llama_set_cross_attn_state(ctx, nullptr);
            }
        }
@@ -1496,6 +1497,7 @@ struct llama_server_context
                {
                    if (slot.task_id == task.target_id)
                    {
                        slot.reset();
                        slot.release();
                        break;
                    }