integrate mllama.cpp to server.cpp
This commit is contained in:
parent cb1118c842
commit 75a07dd8f7
llm/ext_server/CMakeLists.txt (vendored): 2 changed lines
@@ -2,7 +2,7 @@ set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 set(LLAMA_SERVER_LDFLAGS $ENV{LLAMA_SERVER_LDFLAGS})
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp utils.hpp httplib.h)
+add_executable(${TARGET} server.cpp utils.hpp httplib.h mllama.h mllama.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
llm/ext_server/server.cpp (vendored): 92 changed lines
@@ -27,6 +27,8 @@
 
 #include "../llava/clip.h"
 #include "../llava/llava.h"
+#include "mllama.h"
+#include "ggml.h"
 
 #include "stb_image.h"
 
@@ -162,6 +164,7 @@ struct server_slot {
 
     // multimodal
     std::vector<slot_image> images;
+    std::vector<float> cross_attn_state;
 
     // stats
     size_t n_sent_text = 0; // number of sent text character
@@ -173,8 +176,6 @@ struct server_slot {
     double t_prompt_processing; // ms
     double t_token_generation; // ms
 
-    float *cross_attn_state = nullptr;
-
     // multitasks
     int multitask_id = -1;
 
@@ -202,11 +203,6 @@ struct server_slot {
             img.prefix_prompt = "";
         }
 
-        if (cross_attn_state) {
-            free(cross_attn_state);
-            cross_attn_state = nullptr;
-        }
-
         images.clear();
     }
 
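Note on this hunk: the per-slot cross-attention buffer changes ownership. The raw float pointer that had to be freed by hand on every slot reset is replaced by the std::vector<float> cross_attn_state member added in the earlier hunk, so the storage is sized with resize() at encode time and released automatically. A minimal sketch of the ownership change (the struct names slot_before/slot_after are illustrative only, not from the patch):

#include <cstdlib>
#include <vector>

struct slot_before {                     // old layout: manual ownership
    float *cross_attn_state = nullptr;   // filled via malloc() during image encode
    void reset() {
        if (cross_attn_state) {          // had to be freed by hand on every reset
            free(cross_attn_state);
            cross_attn_state = nullptr;
        }
    }
};

struct slot_after {                      // new layout: the vector owns its storage
    std::vector<float> cross_attn_state; // sized with resize() when an image is encoded
    void reset() {
        // nothing to free; memory is released when the slot is destroyed
    }
};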
@@ -344,6 +340,7 @@ struct llama_server_context
     llama_context *ctx = nullptr;
 
     clip_ctx *clp_ctx = nullptr;
+    struct mllama_ctx *mllama_ctx = nullptr;
 
     gpt_params params;
 
@@ -381,6 +378,10 @@ struct llama_server_context
             clip_free(clp_ctx);
             clp_ctx = nullptr;
         }
+        if (mllama_ctx != nullptr) {
+            mllama_free(mllama_ctx);
+            mllama_ctx = nullptr;
+        }
         if (ctx)
         {
             llama_free(ctx);
@@ -397,13 +398,45 @@ struct llama_server_context
     {
         params = params_;
         if (!params.mmproj.empty()) {
-            multimodal = true;
-            LOG_DEBUG("Multi Modal Mode Enabled", {});
+            struct ggml_context *ggml_ctx = nullptr;
+            struct gguf_context *gguf_ctx = gguf_init_from_file(params.mmproj.c_str(), {true, &ggml_ctx});
+            if (gguf_ctx != nullptr) {
+                const int arch_index = gguf_find_key(gguf_ctx, "general.architecture");
+                if (arch_index == -1) {
+                    LOG_ERROR("unknown vision model architecture", {{"model", params.mmproj}});
+                    gguf_free(gguf_ctx);
+                    return false;
+                }
+
+                const std::string arch = gguf_get_val_str(gguf_ctx, arch_index);
+                if (arch == "mllama") {
+                    mllama_ctx = mllama_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
+                    if (mllama_ctx == nullptr) {
+                        LOG_ERROR("unable to load mllama model", {{"model", params.mmproj}});
+                        gguf_free(gguf_ctx);
+                        ggml_free(ggml_ctx);
+                        return false;
+                    }
+                } else if (arch == "clip") {
             clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
             if (clp_ctx == nullptr) {
                 LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
+                gguf_free(gguf_ctx);
+                ggml_free(ggml_ctx);
                 return false;
             }
+                } else {
+                    LOG_ERROR("unknown vision model architecture", {{"model", params.mmproj}});
+                    gguf_free(gguf_ctx);
+                    ggml_free(ggml_ctx);
+                    return false;
+                }
+            }
+
+            multimodal = true;
+
+            gguf_free(gguf_ctx);
+            ggml_free(ggml_ctx);
 
             if (params.n_ctx < 2048) { // request larger context for the image embedding
                 params.n_ctx = 2048;
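For context, the loader now inspects the mmproj GGUF's general.architecture key before deciding which vision backend to initialize. A minimal standalone sketch of that detection step, using the gguf_* API declared in ggml.h as in the hunk above; detect_mmproj_arch is a hypothetical helper name, not part of the patch:

#include <string>
#include "ggml.h"   // the gguf_* API used below is declared here in this tree

// Hypothetical helper mirroring the dispatch above: returns "mllama", "clip",
// or an empty string when the file cannot be read or the key is missing.
static std::string detect_mmproj_arch(const std::string &path) {
    struct ggml_context *ggml_ctx = nullptr;
    struct gguf_init_params init_params = { /*no_alloc =*/ true, /*ctx =*/ &ggml_ctx };
    struct gguf_context *gguf_ctx = gguf_init_from_file(path.c_str(), init_params);
    if (gguf_ctx == nullptr) {
        return "";
    }

    std::string arch;
    const int arch_index = gguf_find_key(gguf_ctx, "general.architecture");
    if (arch_index != -1) {
        arch = gguf_get_val_str(gguf_ctx, arch_index);
    }

    gguf_free(gguf_ctx);
    if (ggml_ctx != nullptr) {
        ggml_free(ggml_ctx);
    }
    return arch;
}

A caller would then pick mllama_model_load() for "mllama" and clip_model_load() for "clip", as the hunk above does.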
@@ -420,10 +453,16 @@ struct llama_server_context
         }
 
         if (multimodal) {
-            const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
+            int n_embd_vision = 0;
+            if (mllama_ctx != nullptr) {
+                n_embd_vision = mllama_n_embd(mllama_ctx);
+            } else if (clp_ctx != nullptr) {
+                n_embd_vision = clip_n_mmproj_embd(clp_ctx);
+            }
+
             const int n_embd_llm = llama_n_embd(model);
-            if (n_embd_clip != n_embd_llm) {
-                LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_clip, n_embd_llm);
+            if (n_embd_vision != n_embd_llm) {
+                LOG_TEE("%s: embedding dim of the multimodal projector (%d) is not equal to that of LLaMA (%d). Make sure that you use the correct mmproj file.\n", __func__, n_embd_vision, n_embd_llm);
                 llama_free(ctx);
                 llama_free_model(model);
                 return false;
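The embedding-width check above now reads the projector dimension from whichever vision context is loaded. A small sketch of that selection, assuming the clip.h and mllama.h headers vendored in this tree; vision_n_embd is an illustrative name only:

#include "../llava/clip.h"
#include "mllama.h"

// Illustrative helper: report the projector embedding width for the active
// vision backend, or 0 when no multimodal context is loaded.
static int vision_n_embd(struct mllama_ctx *mctx, struct clip_ctx *cctx) {
    if (mctx != nullptr) {
        return mllama_n_embd(mctx);
    }
    if (cctx != nullptr) {
        return clip_n_mmproj_embd(cctx);
    }
    return 0;
}

The result is compared against llama_n_embd(model) exactly as before, so a mismatched mmproj file is still rejected at load time.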
@@ -730,9 +769,6 @@ struct llama_server_context
         }
 
         // Check for mllama architecture, which processes images differently than llava
-        char arch_str[256];
-        llama_model_meta_val_str(model, "general.architecture", arch_str, 256);
-        bool is_mllama = strcmp(arch_str, "mllama") == 0;
         if (multimodal)
         {
             const auto &images_data = data.find("image_data");
@@ -742,22 +778,16 @@ struct llama_server_context
                 {
                     const std::vector<uint8_t> image_buffer = base64_decode(img["data"].get<std::string>());
 
-                    if (is_mllama) {
-                        LOG_INFO("MLLAMA architecture detected, processing first image", {{"slot_id", slot->id}});
-                        struct clip_image_f32 *img = clip_image_f32_init();
-                        clip_image_load_from_data(image_buffer.data(), image_buffer.size(), 560, 560, 3, 4, img);
-
-                        const int n = clip_embd_nbytes(clp_ctx);
-                        printf("%s: nbytes %d\n", __func__, n);
-
-                        slot->cross_attn_state = (float *)malloc(n);
-                        printf("%s: nbytes %d image_embd: %p\n", __func__, n, slot->cross_attn_state);
-                        clip_image_encode(clp_ctx, 1, img, slot->cross_attn_state);
-                        llama_set_cross_attn_state(ctx, slot->cross_attn_state);
-
+                    if (mllama_ctx != nullptr) {
+                        const auto &aspect_ratio_id = img["aspect_ratio_id"].get<int>();
+                        struct mllama_image *img = mllama_image_init();
+                        mllama_image_load_from_data(image_buffer.data(), image_buffer.size(), 560, 560, 3, 4, aspect_ratio_id, img);
+                        slot->cross_attn_state.resize(mllama_n_embd_bytes(mllama_ctx));
+                        mllama_image_encode(mllama_ctx, params.cpuparams.n_threads, img, slot->cross_attn_state.data());
+                        llama_set_cross_attn_state(ctx, slot->cross_attn_state.data());
                         break;
-                    }
+                    } else if (clp_ctx != nullptr) {
 
                         slot_image img_sl;
                         img_sl.id = img.count("id") != 0 ? img["id"].get<int>() : slot->images.size();
                         img_sl.img_data = clip_image_u8_init();
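In the mllama branch the image is encoded directly into the slot's cross-attention buffer instead of being queued as a slot_image for prompt-time embedding, and only the first image in the request is processed (the loop breaks afterwards). A minimal sketch of that flow, assuming the vendored mllama.h API and the llama_set_cross_attn_state() extension used above; encode_mllama_image is a hypothetical wrapper, exact mllama_* signatures are assumed, and error handling plus image cleanup are omitted:

#include <cstdint>
#include <vector>
#include "llama.h"
#include "mllama.h"

// Hypothetical wrapper around the steps in the hunk above.
static void encode_mllama_image(struct mllama_ctx *mctx, struct llama_context *lctx,
                                const std::vector<uint8_t> &image_buffer,
                                int aspect_ratio_id, int n_threads,
                                std::vector<float> &cross_attn_state) {
    // decode the raw image bytes; the 560, 560, 3, 4 arguments mirror the
    // constants hard-coded in the patch
    struct mllama_image *img = mllama_image_init();
    mllama_image_load_from_data(image_buffer.data(), image_buffer.size(),
                                560, 560, 3, 4, aspect_ratio_id, img);

    // size the buffer from mllama_n_embd_bytes(), exactly as the patch does
    cross_attn_state.resize(mllama_n_embd_bytes(mctx));
    mllama_image_encode(mctx, n_threads, img, cross_attn_state.data());

    // hand the encoded state to the llama context for the cross-attention layers
    llama_set_cross_attn_state(lctx, cross_attn_state.data());
}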
@@ -775,6 +805,10 @@ struct llama_server_context
                         });
                         img_sl.request_encode_image = true;
                         slot->images.push_back(img_sl);
+                    } else {
+                        LOG_ERROR("no multimodal model loaded", {{"slot_id", slot->id}});
+                        return false;
+                    }
                 }
             // process prompt
             // example: system prompt [img-102] user [img-103] describe [img-134] -> [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, {id: 134, prefix: ' describe '}]}
|
Loading…
x
Reference in New Issue
Block a user