Mirror of https://github.com/tcsenpai/ollama.git (synced 2025-06-07 11:45:21 +00:00)
update server.cpp changes
commit 055cb6b0e2
parent d0c8ce5ea4
llm/ext_server/server.cpp (vendored): 23 lines changed (11 additions, 12 deletions)
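The commit moves the mllama cross-attention setup out of process_images, where it ran on every image-processing pass, and up to completion-task creation. The first hunk below removes the block; the second re-adds it at the new site, dropping the commented-out llama_reset_cross_attn_state(ctx) call along the way.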
@@ -1032,18 +1032,6 @@ struct llama_server_context
     bool process_images(server_slot &slot) const
     {
-        // Set cross attention state for mllama models
-        // TODO (jmorganca): this should be provided via the API
-        // TODO (jmorganca): generalize this beyond mllama models
-        char arch_str[256];
-        llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
-        if (strcmp(arch_str, "mllama") == 0) {
-            // TODO (jmorganca): this should be passed in via the llama_decode api
-            // or similar, maybe using the llama_batch struct
-            // llama_reset_cross_attn_state(ctx);
-            // llama_set_cross_attn_state(ctx, (float*)cross_attn_state);
-        }
-
         for (slot_image &img : slot.images)
         {
             if (!img.request_encode_image)
@@ -1258,6 +1246,17 @@ struct llama_server_context
         task.type = TASK_TYPE_COMPLETION;
         task.multitask_id = multitask_id;
 
+        // Set cross attention state for mllama models
+        // TODO (jmorganca): this should be provided via the API
+        // TODO (jmorganca): generalize this beyond mllama models
+        char arch_str[256];
+        llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
+        if (strcmp(arch_str, "mllama") == 0) {
+            // TODO (jmorganca): this should be passed in via the llama_decode api
+            // or similar, maybe using the llama_batch struct
+            // llama_set_cross_attn_state(ctx, (float*)cross_attn_state);
+        }
+
         // when a completion task's prompt array is not a singleton, we split it into multiple requests
         // otherwise, it's a single-prompt task, we actually queue it
         // if there's numbers in the prompt array it will be treated as an array of tokens
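For context, the check the commit adds keys off the GGUF general.architecture metadata via llama_model_meta_val_str, an upstream llama.cpp API. A minimal sketch of that detection, assuming a loaded llama_model handle; the is_mllama helper name is ours for illustration, not from the commit:

    #include <cstring>
    #include "llama.h"

    // Sketch: detect an mllama model by reading the GGUF
    // "general.architecture" key, as the added hunk does.
    static bool is_mllama(const llama_model * model) {
        char arch_str[256];
        // llama_model_meta_val_str fills arch_str with the value and
        // returns a negative result when the key is missing.
        if (llama_model_meta_val_str(model, "general.architecture",
                                     arch_str, sizeof(arch_str)) < 0) {
            return false;
        }
        return strcmp(arch_str, "mllama") == 0;
    }

The diff itself skips the missing-key check and hard-codes 256 for the buffer size, and the llama_set_cross_attn_state / llama_reset_cross_attn_state calls it references remain commented out, pending the llama_decode-level API the TODOs describe.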
|