Mirror of https://github.com/tcsenpai/ollama.git, synced 2025-06-07 03:35:21 +00:00
Merge pull request #6123 from dhiltgen/go_server_unified
llama: Runtime selection of new or old runners
This commit is contained in: commit e584f14e78
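At a glance, the runtime selection works like this: setting OLLAMA_NEW_RUNNERS opts the scheduler into launching the new Go runner binary (ollama_runner) instead of the existing ollama_llama_server payload. A minimal sketch of that selection in Go, using the names that appear in the diff below; the dir value is a placeholder for the resolved runners directory:

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"runtime"
	"strconv"
)

// pickRunner mirrors the selection added to llm/server.go in this commit:
// the old payload binary is the default, and OLLAMA_NEW_RUNNERS opts in to
// the new runner.
func pickRunner(dir string) string {
	newRunners := false
	if v := os.Getenv("OLLAMA_NEW_RUNNERS"); v != "" {
		if b, err := strconv.ParseBool(v); err == nil {
			newRunners = b
		}
	}

	server := filepath.Join(dir, "ollama_llama_server")
	if newRunners {
		server = filepath.Join(dir, "ollama_runner")
	}
	if runtime.GOOS == "windows" {
		server += ".exe"
	}
	return server
}

func main() {
	// The directory here is a placeholder for illustration only.
	fmt.Println(pickRunner("/tmp/ollama_runners/cpu_avx2"))
}

The hunks below add the environment variable, teach llama/Makefile to build the new runner variants, and wire the selection into llm/server.go.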
@@ -45,6 +45,8 @@ var (
MaxQueuedRequests int
// Set via OLLAMA_MODELS in the environment
ModelsDir string
// Set via OLLAMA_NEW_RUNNERS in the environment
NewRunners bool
// Set via OLLAMA_NOHISTORY in the environment
NoHistory bool
// Set via OLLAMA_NOPRUNE in the environment

@@ -88,6 +90,7 @@ func AsMap() map[string]EnvVar {
"OLLAMA_MAX_LOADED_MODELS": {"OLLAMA_MAX_LOADED_MODELS", MaxRunners, "Maximum number of loaded models per GPU"},
"OLLAMA_MAX_QUEUE": {"OLLAMA_MAX_QUEUE", MaxQueuedRequests, "Maximum number of queued requests"},
"OLLAMA_MODELS": {"OLLAMA_MODELS", ModelsDir, "The path to the models directory"},
"OLLAMA_NEW_RUNNERS": {"OLLAMA_NEW_RUNNERS", NewRunners, "Enable new experimental runners"},
"OLLAMA_NOHISTORY": {"OLLAMA_NOHISTORY", NoHistory, "Do not preserve readline history"},
"OLLAMA_NOPRUNE": {"OLLAMA_NOPRUNE", NoPrune, "Do not prune model blobs on startup"},
"OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel, "Maximum number of parallel requests"},

@@ -170,8 +173,8 @@ func LoadConfig() {
for _, root := range []string{filepath.Dir(appExe), cwd} {
paths = append(paths,
root,
filepath.Join(root, "windows-"+runtime.GOARCH),
filepath.Join(root, "dist", "windows-"+runtime.GOARCH),
filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
)
}

@@ -281,6 +284,13 @@ func LoadConfig() {
RocrVisibleDevices = clean("ROCR_VISIBLE_DEVICES")
GpuDeviceOrdinal = clean("GPU_DEVICE_ORDINAL")
HsaOverrideGfxVersion = clean("HSA_OVERRIDE_GFX_VERSION")

if nr := clean("OLLAMA_NEW_RUNNERS"); nr != "" {
d, err := strconv.ParseBool(nr)
if err == nil {
NewRunners = d
}
}
}

func getModelsDir() (string, error) {
@@ -40,7 +40,7 @@ func TestMultiModelConcurrency(t *testing.T) {
}
resp = [2][]string{
[]string{"sunlight"},
[]string{"england", "english", "massachusetts", "pilgrims", "british"},
[]string{"england", "english", "massachusetts", "pilgrims", "british", "festival"},
}
)
var wg sync.WaitGroup

@@ -275,7 +275,7 @@ func DoGenerate(ctx context.Context, t *testing.T, client *api.Client, genReq ap
break
}
}
require.True(t, atLeastOne, "none of %v found in %s", anyResp, response)
require.True(t, atLeastOne, "%s: none of %v found in %s", genReq.Model, anyResp, response)
slog.Info("test pass", "model", genReq.Model, "prompt", genReq.Prompt, "contains", anyResp, "response", response)
case <-ctx.Done():
t.Error("outer test context done while waiting for generate")
llama/Makefile: 315 changed lines
@@ -4,41 +4,102 @@ ifeq ($(ARCH),x86_64)
ARCH := amd64
endif
ifneq (,$(findstring MINGW,$(OS))$(findstring MSYS,$(OS)))
OS := Windows
OS := windows
else ifeq ($(OS),Linux)
OS := linux
else ifeq ($(OS),Darwin)
OS := darwin
endif
comma:= ,
empty:=
space:= $(empty) $(empty)

export CGO_CFLAGS_ALLOW = -mfma|-mf16c
export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c
export HIP_PLATFORM = amd

SRC_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
BUILD_DIR := $(SRC_DIR)/build/$(OS)/$(ARCH)/
BUILD_DIR = $(SRC_DIR)build/$(OS)-$(ARCH)
DIST_BASE = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))
RUNNERS_DIST_DIR = $(DIST_BASE)/ollama_runners
RUNNERS_PAYLOAD_DIR = $(abspath $(SRC_DIR)/../llm/build/$(OS)/$(patsubst amd64,x86_64,$(ARCH)))
RUNNERS_BUILD_DIR = $(BUILD_DIR)/ollama_runners
DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu)

ifeq ($(OS),Windows)
CUDA_LIBS_SHORT := cublas cudart cublasLt
ROCM_LIBS_SHORT := hipblas rocblas

ifeq ($(OS),windows)
SRC_DIR := $(shell cygpath -m -s "$(SRC_DIR)")
OBJ_EXT := obj
SHARED_EXT := dll
EXE_EXT := .exe
SHARED_PREFIX := ""
CUDA_LIB := $(shell cygpath -w -s "$(CUDA_PATH)\lib\x64")
HIP_LIB := $(shell cygpath -w -s "$(HIP_PATH)\lib")
NVCC := nvcc
# If HIP_PATH has spaces, hipcc trips over them when subprocessing
HIP_PATH := $(shell cygpath -m -s "$(HIP_PATH)\")
export HIP_PATH
HIPCC := $(HIP_PATH)bin/hipcc.bin.exe
else ifeq ($(OS),Linux)
SHARED_PREFIX :=

# TODO needs work for multiple cuda versions on windows

CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))
CUDA_11=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
CUDA_12=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
CUDA_11_LIB_DIR := $(CUDA_11)/bin
CUDA_12_LIB_DIR := $(CUDA_12)/bin

NVCC := $(shell X=$$(which nvcc 2>/dev/null) && cygpath -m -s "$$X")
ifneq ($(HIP_PATH),)
HIP_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)\bin")
# If HIP_PATH has spaces, hipcc trips over them when subprocessing
HIP_PATH := $(shell cygpath -m -s "$(HIP_PATH)\")
export HIP_PATH
HIPCC := $(HIP_PATH)bin/hipcc.bin.exe
endif
CP := cp
CUDA_LIBS = $(wildcard $(addsuffix 64*.$(SHARED_EXT),$(addprefix $(CUDA_LIB_DIR)/$(SHARED_PREFIX),$(CUDA_LIBS_SHORT))))
else ifeq ($(OS),linux)
CP := cp -a
OBJ_EXT := o
SHARED_EXT := so
SHARED_PREFIX := lib
CUDA_PATH?=/usr/local/cuda
HIP_PATH?=/opt/rocm
CUDA_LIB := "$(CUDA_PATH)/lib64"
HIP_LIB := "$(HIP_PATH)/lib"
NVCC := nvcc
HIPCC := hipcc
HIP_LIB_DIR := $(HIP_PATH)/lib
HIPCC := $(shell X=$$(which hipcc 2>/dev/null) && echo $$X)
CUDA_PATH?=/usr/local/cuda
CUDA_11=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
CUDA_12=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
CUDA_11_LIB_DIR := $(CUDA_11)/lib64
CUDA_12_LIB_DIR := $(CUDA_12)/lib64
else
OBJ_EXT := o
SHARED_EXT := so
CP := cp -a
endif

CUDA_11_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(CUDA_11_LIB_DIR)/$(SHARED_PREFIX),$(CUDA_LIBS_SHORT))))
CUDA_12_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(CUDA_12_LIB_DIR)/$(SHARED_PREFIX),$(CUDA_LIBS_SHORT))))
NVCC_11 = $(CUDA_11)/bin/nvcc
NVCC_12 = $(CUDA_12)/bin/nvcc

CUDA_DEPS_DIR = $(DIST_BASE)cuda/
ROCM_DEPS_DIR = $(DIST_BASE)rocm/

ifneq ($(CUDA_11),)
CUDA_11_VARIANT= _v11
CUDA_11_LIB_DEPS = $(addprefix $(CUDA_DEPS_DIR),$(notdir $(CUDA_11_LIBS)))
endif
ifneq ($(CUDA_12),)
CUDA_12_VARIANT= _v12
CUDA_12_LIB_DEPS = $(addprefix $(CUDA_DEPS_DIR),$(notdir $(CUDA_12_LIBS)))
endif
ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
ifneq ($(HIPCC),)
ROCM_VERSION := $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(HIPCC) --version)))))))
ifneq (,$(ROCM_VERSION))
ROCM_VARIANT = _v$(ROCM_VERSION)
endif
ROCM_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT),$(addprefix $(HIP_LIB_DIR)/$(SHARED_PREFIX),$(ROCM_LIBS_SHORT))))
ROCM_LIB_DEPS = $(addprefix $(ROCM_DEPS_DIR),$(notdir $(ROCM_LIBS)))
ROCBLAS_DEP_MANIFEST = $(ROCM_DEPS_DIR)/rocblas/library/TensileManifest.txt
endif
endif

CUDA_SRCS := \

@@ -51,21 +112,19 @@ CUDA_SRCS := \
$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu) \
ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp

CUDA_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
CUDA_OBJS := $(CUDA_OBJS:.c=.cuda.$(OBJ_EXT))
CUDA_OBJS := $(addprefix $(BUILD_DIR),$(CUDA_OBJS:.cpp=.cuda.$(OBJ_EXT)))
CUDA_11_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
CUDA_11_OBJS := $(CUDA_11_OBJS:.c=.cuda.$(OBJ_EXT))
CUDA_11_OBJS := $(addprefix $(BUILD_DIR)/cuda_v11/,$(CUDA_11_OBJS:.cpp=.cuda.$(OBJ_EXT)))
CUDA_12_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
CUDA_12_OBJS := $(CUDA_12_OBJS:.c=.cuda.$(OBJ_EXT))
CUDA_12_OBJS := $(addprefix $(BUILD_DIR)/cuda_v12/,$(CUDA_12_OBJS:.cpp=.cuda.$(OBJ_EXT)))

HIP_OBJS := $(CUDA_SRCS:.cu=.hip.$(OBJ_EXT))
HIP_OBJS := $(HIP_OBJS:.c=.hip.$(OBJ_EXT))
HIP_OBJS := $(addprefix $(BUILD_DIR),$(HIP_OBJS:.cpp=.hip.$(OBJ_EXT)))
HIP_OBJS := $(addprefix $(BUILD_DIR)/,$(HIP_OBJS:.cpp=.hip.$(OBJ_EXT)))

CUDA_FLAGS := \
--generate-code=arch=compute_50,code=[compute_50,sm_50] \
--generate-code=arch=compute_52,code=[compute_52,sm_52] \
--generate-code=arch=compute_61,code=[compute_61,sm_61] \
--generate-code=arch=compute_70,code=[compute_70,sm_70] \
--generate-code=arch=compute_75,code=[compute_75,sm_75] \
--generate-code=arch=compute_80,code=[compute_80,sm_80] \
-t4 \
-DGGML_CUDA_DMMV_X=32 \
-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
-DGGML_USE_CUDA=1 \

@@ -82,6 +141,34 @@ CUDA_FLAGS := \
-I. \
-O3

CUDA_11_FLAGS := \
--generate-code=arch=compute_50,code=[compute_50,sm_50] \
--generate-code=arch=compute_52,code=[compute_52,sm_52] \
--generate-code=arch=compute_53,code=[compute_53,sm_53] \
--generate-code=arch=compute_60,code=[compute_60,sm_60] \
--generate-code=arch=compute_61,code=[compute_61,sm_61] \
--generate-code=arch=compute_62,code=[compute_62,sm_62] \
--generate-code=arch=compute_70,code=[compute_70,sm_70] \
--generate-code=arch=compute_72,code=[compute_72,sm_72] \
--generate-code=arch=compute_75,code=[compute_75,sm_75] \
--generate-code=arch=compute_80,code=[compute_80,sm_80] \
--generate-code=arch=compute_86,code=[compute_86,sm_86]

CUDA_12_FLAGS := \
--generate-code=arch=compute_60,code=[compute_60,sm_60] \
--generate-code=arch=compute_61,code=[compute_61,sm_61] \
--generate-code=arch=compute_62,code=[compute_62,sm_62] \
--generate-code=arch=compute_70,code=[compute_70,sm_70] \
--generate-code=arch=compute_72,code=[compute_72,sm_72] \
--generate-code=arch=compute_75,code=[compute_75,sm_75] \
--generate-code=arch=compute_80,code=[compute_80,sm_80] \
--generate-code=arch=compute_86,code=[compute_86,sm_86] \
--generate-code=arch=compute_87,code=[compute_87,sm_87] \
--generate-code=arch=compute_89,code=[compute_89,sm_89] \
--generate-code=arch=compute_90,code=[compute_90,sm_90] \
--generate-code=arch=compute_90a,code=[compute_90a,sm_90a] \
-DGGML_CUDA_USE_GRAPHS=on

HIP_ARCHS := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
LINUX_HIP_ARCHS := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-

@@ -111,79 +198,181 @@ HIP_FLAGS := \
-I. \
$(foreach arch, $(HIP_ARCHS), --offload-arch=$(arch))

ifeq ($(OS), Linux)
HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch)) -fPIC
CUDA_FLAGS += -fPIC
CFLAGS += -Xcompiler -fPIC -D_GNU_SOURCE
CXXFLAGS += -Xcompiler -fPIC -D_GNU_SOURCE
else ifeq ($(OS),Windows)
ifeq ($(OS),linux)
HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch)) -fPIC -Wno-unused-function
CUDA_FLAGS += -fPIC -Wno-unused-function
NVCC_CFLAGS = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
NVCC_CXXFLAGS = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
HIPCC_CFLAGS = $(CFLAGS) -fPIC -D_GNU_SOURCE
HIPCC_CXXFLAGS = $(CXXFLAGS) -fPIC -D_GNU_SOURCE
else ifeq ($(OS),windows)
HIP_FLAGS += -Xclang --dependent-lib=msvcrt
CFLAGS += -D_WIN32_WINNT=0x602
CXXFLAGS += -D_WIN32_WINNT=0x602
NVCC_CFLAGS = $(CFLAGS)
NVCC_CXXFLAGS = $(CXXFLAGS)
HIPCC_CFLAGS = $(CFLAGS)
HIPCC_CXXFLAGS = $(CXXFLAGS)
endif

RUNNERS := $(BUILD_DIR)ollama_runner$(EXE_EXT)
ifeq ($(OLLAMA_SKIP_CPU_GENERATE),)
RUNNERS := $(DEFAULT_RUNNER)
ifeq ($(ARCH),amd64)
RUNNERS += $(BUILD_DIR)ollama_runner_avx$(EXE_EXT) $(BUILD_DIR)ollama_runner_avx2$(EXE_EXT)
RUNNERS += cpu_avx cpu_avx2
endif
ifneq ($(NVCC),)
RUNNERS += $(BUILD_DIR)ollama_runner_cuda$(EXE_EXT)
endif
ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
ifneq ($(CUDA_11),)
RUNNERS += cuda_v11
endif
ifneq ($(CUDA_12),)
RUNNERS += cuda_v12
endif
endif
ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
ifneq ($(HIPCC),)
RUNNERS += $(BUILD_DIR)ollama_runner_rocm$(EXE_EXT)
RUNNERS += rocm$(ROCM_VARIANT)
endif
endif

runners: $(RUNNERS)
DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_runner$(EXE_EXT),$(RUNNERS)))
PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_runner$(EXE_EXT).gz,$(addsuffix /bin,$(RUNNERS))))
BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_runner$(EXE_EXT),$(RUNNERS)))

$(BUILD_DIR)%.cuda.$(OBJ_EXT): %.cu
all: dist payload

dist: $(DIST_RUNNERS) $(ROCBLAS_DEP_MANIFEST)

ifeq ($(OS),windows)
# Unused on windows as we don't carry the payloads in the go binary
payload:
else
payload: $(PAYLOAD_RUNNERS)
endif

runners: $(BUILD_RUNNERS)

$(BUILD_DIR)/cuda_v11/%.cuda.$(OBJ_EXT): %.cu
@-mkdir -p $(dir $@)
$(NVCC) -c $(CUDA_FLAGS) -o $@ $<
$(NVCC_11) -c $(CUDA_FLAGS) $(CUDA_11_FLAGS) -o $@ $<

$(BUILD_DIR)%.cuda.$(OBJ_EXT): %.c
$(BUILD_DIR)/cuda_v11/%.cuda.$(OBJ_EXT): %.c
@-mkdir -p $(dir $@)
$(NVCC) -c $(CFLAGS) -o $@ $<
$(NVCC_11) -c $(NVCC_CFLAGS) -o $@ $<

$(BUILD_DIR)%.cuda.$(OBJ_EXT): %.cpp
$(BUILD_DIR)/cuda_v11/%.cuda.$(OBJ_EXT): %.cpp
@-mkdir -p $(dir $@)
$(NVCC) -c $(CXXFLAGS) -o $@ $<
$(NVCC_11) -c $(NVCC_CXXFLAGS) -o $@ $<

$(BUILD_DIR)$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT): $(CUDA_OBJS)
$(BUILD_DIR)/cuda_v12/%.cuda.$(OBJ_EXT): %.cu
@-mkdir -p $(dir $@)
$(NVCC) --shared -lcuda -lcublas -lcudart -lcublasLt $(CUDA_FLAGS) $(CUDA_OBJS) -o $@
$(NVCC_12) -c $(CUDA_FLAGS) $(CUDA_12_FLAGS) -o $@ $<

$(BUILD_DIR)%.hip.$(OBJ_EXT): %.cu
$(BUILD_DIR)/cuda_v12/%.cuda.$(OBJ_EXT): %.c
@-mkdir -p $(dir $@)
$(NVCC_12) -c $(NVCC_CFLAGS) -o $@ $<

$(BUILD_DIR)/cuda_v12/%.cuda.$(OBJ_EXT): %.cpp
@-mkdir -p $(dir $@)
$(NVCC_12) -c $(NVCC_CXXFLAGS) -o $@ $<

$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
@-mkdir -p $(dir $@)
cp $< $@

$(RUNNERS_DIST_DIR)/cuda_v11/ollama_runner$(EXE_EXT): $(RUNNERS_DIST_DIR)/cuda_v11/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT)
$(RUNNERS_PAYLOAD_DIR)/cuda_v11/bin/ollama_runner$(EXE_EXT).gz: $(RUNNERS_PAYLOAD_DIR)/cuda_v11/bin/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT).gz
$(RUNNERS_DIST_DIR)/cuda_v12/ollama_runner$(EXE_EXT): $(RUNNERS_DIST_DIR)/cuda_v12/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT)
$(RUNNERS_PAYLOAD_DIR)/cuda_v12/bin/ollama_runner$(EXE_EXT).gz: $(RUNNERS_PAYLOAD_DIR)/cuda_v12/bin/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT).gz

$(RUNNERS_BUILD_DIR)/cuda_v11/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT): $(CUDA_11_OBJS) $(CUDA_11_LIB_DEPS)
@-mkdir -p $(dir $@)
$(NVCC_11) --shared -lcuda -L${CUDA_DEPS_DIR} $(foreach lib, $(CUDA_LIBS_SHORT), -l$(lib)) $(CUDA_FLAGS) $(CUDA_11_FLAGS) $(CUDA_11_OBJS) -o $@

$(RUNNERS_BUILD_DIR)/cuda_v12/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT): $(CUDA_12_OBJS) $(CUDA_12_LIB_DEPS)
@-mkdir -p $(dir $@)
$(NVCC_12) --shared -lcuda -L${CUDA_DEPS_DIR} $(foreach lib, $(CUDA_LIBS_SHORT), -l$(lib)) $(CUDA_FLAGS) $(CUDA_12_FLAGS) $(CUDA_12_OBJS) -o $@

$(CUDA_11_LIB_DEPS):
@-mkdir -p $(dir $@)
$(CP) $(CUDA_11_LIB_DIR)/$(notdir $@)* $(dir $@)

$(CUDA_12_LIB_DEPS):
@-mkdir -p $(dir $@)
$(CP) $(CUDA_12_LIB_DIR)/$(notdir $@)* $(dir $@)

$(BUILD_DIR)/%.hip.$(OBJ_EXT): %.cu
@-mkdir -p $(dir $@)
$(HIPCC) -c $(HIP_FLAGS) -o $@ $<

$(BUILD_DIR)%.hip.$(OBJ_EXT): %.c
$(BUILD_DIR)/%.hip.$(OBJ_EXT): %.c
@-mkdir -p $(dir $@)
$(HIPCC) -c $(CFLAGS) -o $@ $<
$(HIPCC) -c $(HIPCC_CFLAGS) -o $@ $<

$(BUILD_DIR)%.hip.$(OBJ_EXT): %.cpp
$(BUILD_DIR)/%.hip.$(OBJ_EXT): %.cpp
@-mkdir -p $(dir $@)
$(HIPCC) -c $(CXXFLAGS) -o $@ $<
$(HIPCC) -c $(HIPCC_CXXFLAGS) -o $@ $<

$(BUILD_DIR)$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS)
$(RUNNERS_DIST_DIR)/rocm$(ROCM_VARIANT)/ollama_runner$(EXE_EXT): $(RUNNERS_DIST_DIR)/rocm$(ROCM_VARIANT)/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT)
$(RUNNERS_PAYLOAD_DIR)/rocm$(ROCM_VARIANT)/bin/ollama_runner$(EXE_EXT).gz: $(RUNNERS_PAYLOAD_DIR)/rocm$(ROCM_VARIANT)/bin/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT).gz

$(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS) $(ROCM_LIB_DEPS)
@-mkdir -p $(dir $@)
$(HIPCC) --shared -lhipblas -lamdhip64 -lrocblas $(HIP_OBJS) -o $@
$(HIPCC) --shared -lamdhip64 -L${ROCM_DEPS_DIR} $(foreach lib, $(ROCM_LIBS_SHORT), -l$(lib)) $(HIP_OBJS) -o $@

$(BUILD_DIR)ollama_runner$(EXE_EXT):
$(ROCM_LIB_DEPS):
@-mkdir -p $(dir $@)
$(CP) $(HIP_LIB_DIR)/$(notdir $@)* $(dir $@)

$(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_runner$(EXE_EXT): *.go ./runner/*.go
@-mkdir -p $(dir $@)
CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -o $@ ./runner

$(BUILD_DIR)ollama_runner_avx$(EXE_EXT):
$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_runner$(EXE_EXT): *.go ./runner/*.go
@-mkdir -p $(dir $@)
CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx -o $@ ./runner

$(BUILD_DIR)ollama_runner_avx2$(EXE_EXT):
$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_runner$(EXE_EXT): *.go ./runner/*.go
@-mkdir -p $(dir $@)
CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,avx2 -o $@ ./runner

$(BUILD_DIR)ollama_runner_cuda$(EXE_EXT): $(BUILD_DIR)$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT)
CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(CUDA_LIB)" go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner
$(RUNNERS_BUILD_DIR)/cuda_v11/ollama_runner$(EXE_EXT): $(RUNNERS_BUILD_DIR)/cuda_v11/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT) *.go ./runner/*.go
@-mkdir -p $(dir $@)
CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(RUNNERS_BUILD_DIR)/cuda_v11/" go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner

$(BUILD_DIR)ollama_runner_rocm$(EXE_EXT): $(BUILD_DIR)$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT)
CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(HIP_LIB)" go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner
$(RUNNERS_BUILD_DIR)/cuda_v12/ollama_runner$(EXE_EXT): $(RUNNERS_BUILD_DIR)/cuda_v12/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT) *.go ./runner/*.go
@-mkdir -p $(dir $@)
CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(RUNNERS_BUILD_DIR)/cuda_v12/" go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner

$(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/ollama_runner$(EXE_EXT): $(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT) *.go ./runner/*.go
@-mkdir -p $(dir $@)
CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/" go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner

$(ROCBLAS_DEP_MANIFEST):
@-mkdir -p $(dir $@)
@echo "Copying rocblas library..."
cd $(HIP_LIB_DIR)/rocblas/library/ && tar cf - . | (cd $(dir $@) && tar xf - )
@echo "rocblas library copy complete"

$(RUNNERS_PAYLOAD_DIR)/%/bin/ollama_runner.gz: $(RUNNERS_BUILD_DIR)/%/ollama_runner
@-mkdir -p $(dir $@)
gzip --best -c $< > $@
$(RUNNERS_PAYLOAD_DIR)/cuda_v11/bin/%.gz: $(RUNNERS_BUILD_DIR)/cuda_v11/%
@-mkdir -p $(dir $@)
gzip --best -c $< > $@
$(RUNNERS_PAYLOAD_DIR)/cuda_v12/bin/%.gz: $(RUNNERS_BUILD_DIR)/cuda_v12/%
@-mkdir -p $(dir $@)
gzip --best -c $< > $@
$(RUNNERS_PAYLOAD_DIR)/rocm$(ROCM_VARIANT)/bin/%.gz: $(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/%
@-mkdir -p $(dir $@)
gzip --best -c $< > $@

clean:
rm -rf $(BUILD_DIR)
rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)

.PHONY: runners clean ollama_runner$(EXE_EXT) ollama_runner_avx$(EXE_EXT) ollama_runner_avx2$(EXE_EXT) ollama_runner_cuda$(EXE_EXT) ollama_runner_rocm$(EXE_EXT)
.PHONY: all dist payload runners clean $(RUNNERS)

# Handy debugging for make variables
print-%:
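The Makefile above gives each runner variant its own subdirectory (cpu or metal, cpu_avx, cpu_avx2, cuda_v11, cuda_v12, rocm with a version suffix) under build/<os>-<arch>/ollama_runners, and mirrors that layout into dist/<os>-<arch>/ollama_runners plus gzipped payloads on non-Windows platforms. A small sketch, assuming that layout, of how the produced variants could be enumerated; the base path is an assumption for illustration, not something the Makefile exports:

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"runtime"
)

func main() {
	// Layout produced by llama/Makefile: one directory per runner variant.
	base := filepath.Join("build", runtime.GOOS+"-"+runtime.GOARCH, "ollama_runners")

	entries, err := os.ReadDir(base)
	if err != nil {
		fmt.Println("no runners built yet:", err)
		return
	}
	for _, e := range entries {
		if !e.IsDir() {
			continue
		}
		// Each variant directory is expected to hold an ollama_runner binary
		// (ollama_runner.exe on Windows).
		bin := filepath.Join(base, e.Name(), "ollama_runner")
		if runtime.GOOS == "windows" {
			bin += ".exe"
		}
		if _, err := os.Stat(bin); err == nil {
			fmt.Println("runner variant:", e.Name(), "->", bin)
		}
	}
}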
@@ -10,10 +10,13 @@ import (
"math"
"net"
"net/http"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"time"

"github.com/ollama/ollama/api"
"github.com/ollama/ollama/llama"

@@ -50,6 +53,12 @@ type Sequence struct {
embeddingOnly bool

doneReason string

// Metrics
t_start_process_prompt time.Time
t_start_genereration time.Time
n_decoded int
n_prompt_tokens int
}

// prompt returns true if the prompt is still being processed

@@ -59,7 +68,7 @@ func (s *Sequence) prompt() bool {
}

func (s *Server) NewSequence(prompt string, numPredict int, stop []string, params *llama.SamplingParams, embedding bool) *Sequence {
tokens, err := s.lc.Model().Tokenize(prompt, embedding, true)
tokens, err := s.lc.Model().Tokenize(prompt, true, true)
if err != nil {
panic(err)
}

@@ -80,12 +89,13 @@ func (s *Server) NewSequence(prompt string, numPredict int, stop []string, param
}

return &Sequence{
tokens: tokens,
responses: make(chan string, 1),
embedding: make(chan []float32, 1),
samplingCtx: sc,
embeddingOnly: embedding,
stop: stop,
tokens: tokens,
n_prompt_tokens: len(tokens),
responses: make(chan string, 1),
embedding: make(chan []float32, 1),
samplingCtx: sc,
embeddingOnly: embedding,
stop: stop,
}
}

@@ -138,7 +148,7 @@ func (s *Server) run(ctx context.Context) {
case <-ctx.Done():
return
default:
slog.Info("Processing batch", "seqs", len(s.seqs))
slog.Debug("Processing batch", "seqs", len(s.seqs))
s.mu.Lock()
for s.allNil() {
s.cond.Wait() // Wait until an item is added

@@ -161,6 +171,10 @@ func (s *Server) run(ctx context.Context) {
continue
}

if seq.t_start_process_prompt.IsZero() {
seq.t_start_process_prompt = time.Now()
}

for j, t := range seq.tokens {
// todo: make this n_batch
if j > s.batchSize {

@@ -174,6 +188,7 @@ func (s *Server) run(ctx context.Context) {

err := s.lc.Decode(batch)
if err != nil {
slog.Error("failed to decode batch", "error", err)
panic("Failed to decode")
}

@@ -207,11 +222,15 @@ func (s *Server) run(ctx context.Context) {
token := seq.samplingCtx.Sample(s.lc, nil, seq.iBatch)

seq.samplingCtx.Accept(s.lc, token, true)
seq.n_decoded += 1
if seq.n_decoded == 1 {
seq.t_start_genereration = time.Now()
}
piece := s.model.TokenToPiece(token)

seq.numPredicted++

slog.Info("sampled", "piece", piece)
slog.Debug("sampled", "piece", piece)

// if it's an end of sequence token, break
// TODO: just end this sequence

@@ -278,8 +297,26 @@ type CompletionRequest struct {
api.Options
}

type Timings struct {
PredictedN int `json:"predicted_n"`
PredictedMS float64 `json:"predicted_ms"`
PromptN int `json:"prompt_n"`
PromptMS float64 `json:"prompt_ms"`
}

type CompletionResponse struct {
Token string `json:"token"`
Content string `json:"content"`
Stop bool `json:"stop"`

Model string `json:"model,omitempty"`
Prompt string `json:"prompt,omitempty"`
StoppedLimit bool `json:"stopped_limit,omitempty"`
PredictedN int `json:"predicted_n,omitempty"`
PredictedMS float64 `json:"predicted_ms,omitempty"`
PromptN int `json:"prompt_n,omitempty"`
PromptMS float64 `json:"prompt_ms,omitempty"`

Timings Timings `json:"timings"`
}

func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
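With the new schema, each streamed line carries a content chunk, and the final line sets stop together with a timings block: prompt_ms covers the span from the first prompt batch to the first generated token, and predicted_ms covers generation itself. A short sketch of what the encoder side produces, using only the fields shown above; the numbers are placeholders:

package main

import (
	"encoding/json"
	"os"
)

// Trimmed-down copies of the runner's response structs, for illustration.
type Timings struct {
	PredictedN  int     `json:"predicted_n"`
	PredictedMS float64 `json:"predicted_ms"`
	PromptN     int     `json:"prompt_n"`
	PromptMS    float64 `json:"prompt_ms"`
}

type CompletionResponse struct {
	Content string  `json:"content"`
	Stop    bool    `json:"stop"`
	Timings Timings `json:"timings"`
}

func main() {
	enc := json.NewEncoder(os.Stdout)

	// Intermediate chunks: content only.
	enc.Encode(&CompletionResponse{Content: "Hello"})
	enc.Encode(&CompletionResponse{Content: ", world"})

	// Final message: stop plus the timings summary. Values are made up.
	enc.Encode(&CompletionResponse{
		Stop: true,
		Timings: Timings{
			PromptN:     12,
			PromptMS:    85.0,
			PredictedN:  2,
			PredictedMS: 40.0,
		},
	})
}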
@@ -326,9 +363,9 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {
s.mu.Unlock()

// stream the response
for token := range seq.responses {
for content := range seq.responses {
if err := json.NewEncoder(w).Encode(&CompletionResponse{
Token: token,
Content: content,
}); err != nil {
log.Println("Failed to encode result:", err)
return

@@ -342,6 +379,28 @@ func (s *Server) completion(w http.ResponseWriter, r *http.Request) {

flusher.Flush()
}

// Send the stop
if err := json.NewEncoder(w).Encode(&CompletionResponse{
Stop: true,
Timings: Timings{
PromptN: seq.n_prompt_tokens,
PromptMS: float64(seq.t_start_genereration.Sub(seq.t_start_process_prompt).Milliseconds()),
PredictedN: seq.n_decoded,
PredictedMS: float64(time.Since(seq.t_start_genereration).Milliseconds()),
},
}); err != nil {
log.Println("Failed to encode result:", err)
return
}

flusher, ok := w.(http.Flusher)
if !ok {
http.Error(w, "Streaming not supported", http.StatusInternalServerError)
return
}

flusher.Flush()
}

type EmbeddingRequest struct {

@@ -407,7 +466,6 @@ type HealthResponse struct {
// TODO (jmorganca): is it safe to do this concurrently with decoding?
func (s *Server) health(w http.ResponseWriter, r *http.Request) {
w.Header().Set("Content-Type", "application/json")

if err := json.NewEncoder(w).Encode(&HealthResponse{
Status: s.status,
Progress: s.progress,

@@ -419,17 +477,63 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {

func main() {
mpath := flag.String("model", "", "Path to model binary file")
ppath := flag.String("projector", "", "Path to projector binary file")
ppath := flag.String("mmproj", "", "Path to projector binary file")
parallel := flag.Int("parallel", 1, "Number of sequences to handle simultaneously")
batchSize := flag.Int("batch-size", 512, "Batch size")
nGpuLayers := flag.Int("num-gpu", 0, "Number of layers to offload to GPU")
nGpuLayers := flag.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
mainGpu := flag.Int("main-gpu", 0, "Main GPU")
flashAttention := flag.Bool("flash-attention", false, "Enable flash attention")
numCtx := flag.Int("num-ctx", 2048, "Context (or KV cache) size")
flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
numCtx := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
lpath := flag.String("lora", "", "Path to lora layer file")
port := flag.Int("port", 8080, "Port to expose the server on")
threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")

// TODO not yet implemented but wired to keep the parsing aligned
embedding := flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
logDisable := flag.Bool("log-disable", false, "disables logging to a file")
verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
f32 := flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
noMmap := flag.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")

flag.Parse()
level := slog.LevelInfo
if *verbose {
level = slog.LevelDebug
}
handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
Level: level,
AddSource: true,
ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
if attr.Key == slog.SourceKey {
source := attr.Value.Any().(*slog.Source)
source.File = filepath.Base(source.File)
}
return attr
},
})
slog.SetDefault(slog.New(handler))

// TODO actually implement...
if *embedding {
slog.Warn("embeddings not yet supported")
}
if *logDisable {
slog.Info("ignoring --log-disable")
}
if *f32 {
slog.Warn("memory-f32 not yet supported")
}
if *noMmap {
slog.Warn("no-mmap not yet supported")
}
if *mlock {
slog.Warn("mlock not yet supported")
}
if *tensorSplit != "" {
slog.Warn("tensor-split not yet implemented")
}

server := &Server{
numCtx: *numCtx,
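The renamed flags (--mmproj, --n-gpu-layers, --flash-attn, --ctx-size) and the placeholder flags above keep the new runner's command line parseable with the same argument list the Ollama scheduler builds when it launches a runner subprocess. A hypothetical launch of the new runner with a few of those flags; the binary path and values are placeholders, not taken from the commit:

package main

import (
	"log"
	"os"
	"os/exec"
)

func main() {
	// Placeholder paths and values; in practice the scheduler fills these in.
	args := []string{
		"--model", "/path/to/model.gguf",
		"--ctx-size", "2048",
		"--batch-size", "512",
		"--n-gpu-layers", "33",
		"--flash-attn",
		"--parallel", "1",
		"--port", "8080",
	}

	cmd := exec.Command("/path/to/ollama_runner", args...)
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr
	if err := cmd.Start(); err != nil {
		log.Fatalf("failed to start runner: %v", err)
	}
	log.Printf("runner started, pid %d; it serves HTTP on --port until killed", cmd.Process.Pid)
	_ = cmd.Wait()
}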
@@ -442,7 +546,7 @@ func main() {
// load the model
llama.BackendInit()
params := llama.NewModelParams(*nGpuLayers, *mainGpu, func(progress float32) {
slog.Info("Loading model", "progress %", math.Round(float64(progress*100)))
slog.Debug("Loading model", "progress %", math.Round(float64(progress*100)))
server.progress = progress
})
server.model = llama.LoadModelFromFile(*mpath, params)

@@ -475,7 +579,7 @@ func main() {
defer listener.Close()

mux := http.NewServeMux()
mux.HandleFunc("/embeddings", server.embeddings)
mux.HandleFunc("/embedding", server.embeddings)
mux.HandleFunc("/completion", server.completion)
mux.HandleFunc("/health", server.health)

@@ -483,7 +587,7 @@ func main() {
Handler: mux,
}

server.status = "ready"
server.status = "ok"

log.Println("Server listening on", addr)
if err := httpServer.Serve(listener); err != nil {
@@ -18,7 +18,7 @@ sign() {
fi
}

COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DGGML_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"

case "${GOARCH}" in
"amd64")
llm/llm.go: 66 changed lines
@@ -12,6 +12,7 @@ package llm
import "C"
import (
"fmt"
"log/slog"
"unsafe"
)

@@ -37,3 +38,68 @@ func Quantize(infile, outfile string, ftype fileType) error {

return nil
}

type loadedModel struct {
model *C.struct_llama_model
}

func loadModel(modelfile string, vocabOnly bool) (*loadedModel, error) {
// TODO figure out how to quiet down the logging so we don't have 2 copies of the model metadata showing up
slog.Info("XXX initializing default model params")
params := C.llama_model_default_params()
params.vocab_only = C.bool(vocabOnly)

cmodelfile := C.CString(modelfile)
defer C.free(unsafe.Pointer(cmodelfile))

slog.Info("XXX loading model", "model", modelfile)
model := C.llama_load_model_from_file(cmodelfile, params)
if model == nil {
return nil, fmt.Errorf("failed to load model %s", modelfile)
}
return &loadedModel{model}, nil
}

func freeModel(model *loadedModel) {
C.llama_free_model(model.model)
}

func tokenize(model *loadedModel, content string) ([]int, error) {
ccontent := C.CString(content)
defer C.free(unsafe.Pointer(ccontent))

len := len(content) + 2
tokens := make([]C.int32_t, len)

tokenCount := C.llama_tokenize(model.model, ccontent, C.int32_t(len), &tokens[0], C.int32_t(len), true, true)
if tokenCount < 0 {
slog.Info("XXX got negative response", "count", tokenCount)
tokens = make([]C.int32_t, int(tokenCount))
tokenCount = C.llama_tokenize(model.model, ccontent, C.int32_t(len), &tokens[0], tokenCount, true, true)
} else if tokenCount == 0 {
return nil, nil
}
ret := make([]int, tokenCount)
for i := range int(tokenCount) {
ret[i] = int(tokens[i])
}
slog.Debug("XXX tokenized", "tokens", tokens, "content", content)
return ret, nil
}

func detokenize(model *loadedModel, tokens []int) string {
slog.Info("XXX in CGO detokenize")
var resp string
for _, token := range tokens {
buf := make([]C.char, 8)
nTokens := C.llama_token_to_piece(model.model, C.int(token), &buf[0], 8, 0, true)
if nTokens < 0 {
buf = make([]C.char, -nTokens)
nTokens = C.llama_token_to_piece(model.model, C.int(token), &buf[0], -nTokens, 0, true)
}
tokString := C.GoStringN(&buf[0], nTokens)
resp += tokString
}
slog.Debug("XXX detokenized", "tokens", tokens, "content", resp)
return resp
}
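Both llama_tokenize and llama_token_to_piece follow the usual llama.cpp convention of returning a negative value whose magnitude is the required buffer size, so the caller allocates that many entries and calls again, as the detokenize loop above does. A pure-Go sketch of that call-twice pattern against a stand-in function (no cgo involved), just to illustrate the convention:

package main

import "fmt"

// fakeTokenize stands in for a C API such as llama_tokenize: it writes up to
// len(buf) tokens and returns the count, or a negative required size when the
// buffer is too small.
func fakeTokenize(text string, buf []int32) int32 {
	need := int32(len(text)) // pretend one token per byte
	if int32(len(buf)) < need {
		return -need
	}
	for i := int32(0); i < need; i++ {
		buf[i] = int32(text[i])
	}
	return need
}

func tokenizeWithRetry(text string) []int32 {
	buf := make([]int32, 8) // deliberately small first guess
	n := fakeTokenize(text, buf)
	if n < 0 {
		// Negative return: reallocate to the required size and call again.
		buf = make([]int32, -n)
		n = fakeTokenize(text, buf)
	}
	return buf[:n]
}

func main() {
	fmt.Println(tokenizeWithRetry("runtime selection of new or old runners"))
}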
llm/server.go: 112 changed lines
@@ -49,6 +49,7 @@ type llmServer struct {
done chan error // Channel to signal when the process exits
status *StatusWriter
options api.Options
model *loadedModel

estimate MemoryEstimate
totalLayers uint64

@@ -256,9 +257,10 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
params = append(params, "--mlock")
}

if opts.UseNUMA {
params = append(params, "--numa")
}
// TODO - fully unwind this - numa is no longer a boolean flag
// if opts.UseNUMA {
// params = append(params, "--numa")
// }

params = append(params, "--parallel", fmt.Sprintf("%d", numParallel))

@@ -316,6 +318,9 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}

server := filepath.Join(dir, "ollama_llama_server")
if envconfig.NewRunners {
server = filepath.Join(dir, "ollama_runner")
}
if runtime.GOOS == "windows" {
server += ".exe"
}

@@ -323,6 +328,9 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
// Detect tmp cleaners wiping out the file
_, err := os.Stat(server)
if errors.Is(err, os.ErrNotExist) {
if envconfig.NewRunners {
return nil, fmt.Errorf("experimental runners enabled, but not present in this build")
}
slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
err = Init()
if err != nil {

@@ -331,11 +339,16 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
}
}

m, err := loadModel(model, true)
if err != nil {
return nil, fmt.Errorf("unable to load model for tokenization %w", err)
}
s := &llmServer{
port: port,
cmd: exec.Command(server, finalParams...),
status: NewStatusWriter(os.Stderr),
options: opts,
model: m,
estimate: estimate,
sem: semaphore.NewWeighted(int64(numParallel)),
totalLayers: ggml.KV().BlockCount() + 1,

@@ -809,16 +822,16 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
continue
}

// slog.Debug("got line", "line", string(line))
evt, ok := bytes.CutPrefix(line, []byte("data: "))
if !ok {
return fmt.Errorf("error parsing llm response stream: %s", line)
evt = line
}

var c completion
if err := json.Unmarshal(evt, &c); err != nil {
return fmt.Errorf("error unmarshalling llm prediction response: %v", err)
}

switch {
case strings.TrimSpace(c.Content) == lastToken:
tokenRepeat++

@@ -920,7 +933,7 @@ func (s *llmServer) Embed(ctx context.Context, input []string) ([][]float32, err
}

if resp.StatusCode >= 400 {
log.Printf("llm encode error: %s", body)
log.Printf("llm embedding error: %s", body)
return nil, fmt.Errorf("%s", body)
}

@@ -941,47 +954,7 @@ type TokenizeResponse struct {
}

func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) {
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return nil, err
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
}

data, err := json.Marshal(TokenizeRequest{Content: content})
if err != nil {
return nil, fmt.Errorf("marshaling encode data: %w", err)
}

req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data))
if err != nil {
return nil, fmt.Errorf("encode request: %w", err)
}
req.Header.Set("Content-Type", "application/json")

resp, err := http.DefaultClient.Do(req)
if err != nil {
return nil, fmt.Errorf("do encode request: %w", err)
}
defer resp.Body.Close()

body, err := io.ReadAll(resp.Body)
if err != nil {
return nil, fmt.Errorf("read encode request: %w", err)
}

if resp.StatusCode >= 400 {
log.Printf("llm encode error: %s", body)
return nil, fmt.Errorf("%s", body)
}

var encoded TokenizeResponse
if err := json.Unmarshal(body, &encoded); err != nil {
return nil, fmt.Errorf("unmarshal encode response: %w", err)
}

return encoded.Tokens, nil
return tokenize(s.model, content)
}

type DetokenizeRequest struct {

@@ -993,50 +966,13 @@ type DetokenizeResponse struct {
}

func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
// Make sure the server is ready
status, err := s.getServerStatus(ctx)
if err != nil {
return "", err
} else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
return "", fmt.Errorf("unexpected server status: %s", status.ToString())
}

data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
if err != nil {
return "", fmt.Errorf("marshaling decode data: %w", err)
}

req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data))
if err != nil {
return "", fmt.Errorf("decode request: %w", err)
}
req.Header.Set("Content-Type", "application/json")

resp, err := http.DefaultClient.Do(req)
if err != nil {
return "", fmt.Errorf("do decode request: %w", err)
}
defer resp.Body.Close()

body, err := io.ReadAll(resp.Body)
if err != nil {
return "", fmt.Errorf("read decode request: %w", err)
}

if resp.StatusCode >= 400 {
log.Printf("llm decode error: %s", body)
return "", fmt.Errorf("%s", body)
}

var decoded DetokenizeResponse
if err := json.Unmarshal(body, &decoded); err != nil {
return "", fmt.Errorf("unmarshal encode response: %w", err)
}

return decoded.Content, nil
return detokenize(s.model, tokens), nil
}

func (s *llmServer) Close() error {
if s.model != nil {
freeModel(s.model)
}
if s.cmd != nil {
slog.Debug("stopping llama server")
if err := s.cmd.Process.Kill(); err != nil {