Mirror of https://github.com/tcsenpai/ollama.git, synced 2025-06-07 11:45:21 +00:00
Runtime selection of new or old runners
This adjusts the new runners to commingle with the existing runners so we can use an env var to toggle the new runners on.
This commit is contained in:
parent 8527028bf4
commit 751009a5d7
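
For orientation, here is a minimal Go sketch of the toggle this change wires up. The names (OLLAMA_NEW_RUNNERS, ollama_runner, ollama_llama_server) are taken from the diff below; the real selection happens in NewLlamaServer in llm/server.go, and the exact truthiness rules live in envconfig.Bool, which is not shown in this commit.

package main

import (
    "fmt"
    "os"
    "path/filepath"
    "runtime"
)

// newRunners stands in for envconfig.NewRunners(): treat any non-empty
// OLLAMA_NEW_RUNNERS value as "enabled" (the real parsing may differ).
func newRunners() bool {
    return os.Getenv("OLLAMA_NEW_RUNNERS") != ""
}

// pickServer mirrors the selection logic added to NewLlamaServer below.
func pickServer(dir string) string {
    server := filepath.Join(dir, "ollama_llama_server")
    if newRunners() {
        server = filepath.Join(dir, "ollama_runner")
    }
    if runtime.GOOS == "windows" {
        server += ".exe"
    }
    return server
}

func main() {
    fmt.Println(pickServer("/tmp/ollama_runners/cpu_avx2"))
}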
@@ -140,6 +140,8 @@ var (
     SchedSpread = Bool("OLLAMA_SCHED_SPREAD")
     // IntelGPU enables experimental Intel GPU detection.
     IntelGPU = Bool("OLLAMA_INTEL_GPU")
+    // Set via OLLAMA_NEW_RUNNERS in the environment
+    NewRunners = Bool("OLLAMA_NEW_RUNNERS")
 )

 func String(s string) func() string {
@@ -250,6 +252,7 @@ func AsMap() map[string]EnvVar {
        "OLLAMA_NOHISTORY":    {"OLLAMA_NOHISTORY", NoHistory(), "Do not preserve readline history"},
        "OLLAMA_NOPRUNE":      {"OLLAMA_NOPRUNE", NoPrune(), "Do not prune model blobs on startup"},
        "OLLAMA_NUM_PARALLEL": {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
+       "OLLAMA_NEW_RUNNERS":  {"OLLAMA_NEW_RUNNERS", NewRunners(), "Enable new experimental runners"},
        "OLLAMA_ORIGINS":      {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
        "OLLAMA_RUNNERS_DIR":  {"OLLAMA_RUNNERS_DIR", RunnersDir(), "Location for runners"},
        "OLLAMA_SCHED_SPREAD": {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
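The Bool helper itself is not part of this diff. As an assumption, a minimal stand-in that matches how the new OLLAMA_NEW_RUNNERS entry is consumed might look like this; the real envconfig.Bool in the repository may parse values differently.

package envconfig

import (
    "os"
    "strconv"
)

// Bool returns a getter that reports whether the named environment variable
// is set to a truthy value. Illustrative only; not code from this commit.
func Bool(k string) func() bool {
    return func() bool {
        s := os.Getenv(k)
        if s == "" {
            return false
        }
        b, err := strconv.ParseBool(s)
        if err != nil {
            // Assumption: any non-empty, non-parsable value counts as enabled.
            return true
        }
        return b
    }
}

// NewRunners is toggled via OLLAMA_NEW_RUNNERS, as added in the hunk above.
var NewRunners = Bool("OLLAMA_NEW_RUNNERS")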
llama/Makefile (315)
@@ -4,41 +4,102 @@ ifeq ($(ARCH),x86_64)
     ARCH := amd64
 endif
 ifneq (,$(findstring MINGW,$(OS))$(findstring MSYS,$(OS)))
-    OS := Windows
+    OS := windows
+else ifeq ($(OS),Linux)
+    OS := linux
+else ifeq ($(OS),Darwin)
+    OS := darwin
 endif
+comma:= ,
+empty:=
+space:= $(empty) $(empty)

 export CGO_CFLAGS_ALLOW = -mfma|-mf16c
 export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c
 export HIP_PLATFORM = amd

 SRC_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
-BUILD_DIR := $(SRC_DIR)/build/$(OS)/$(ARCH)/
+BUILD_DIR = $(SRC_DIR)build/$(OS)-$(ARCH)
+DIST_BASE = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))
+RUNNERS_DIST_DIR = $(DIST_BASE)/ollama_runners
+RUNNERS_PAYLOAD_DIR = $(abspath $(SRC_DIR)/../llm/build/$(OS)/$(patsubst amd64,x86_64,$(ARCH)))
+RUNNERS_BUILD_DIR = $(BUILD_DIR)/ollama_runners
+DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu)

-ifeq ($(OS),Windows)
+CUDA_LIBS_SHORT := cublas cudart cublasLt
+ROCM_LIBS_SHORT := hipblas rocblas

+ifeq ($(OS),windows)
+    SRC_DIR := $(shell cygpath -m -s "$(SRC_DIR)")
     OBJ_EXT := obj
     SHARED_EXT := dll
     EXE_EXT := .exe
-    SHARED_PREFIX := ""
+    SHARED_PREFIX :=
-    CUDA_LIB := $(shell cygpath -w -s "$(CUDA_PATH)\lib\x64")
-    HIP_LIB := $(shell cygpath -w -s "$(HIP_PATH)\lib")
+    # TODO needs work for multiple cuda versions on windows
-    NVCC := nvcc
-    # If HIP_PATH has spaces, hipcc trips over them when subprocessing
+    CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))
-    HIP_PATH := $(shell cygpath -m -s "$(HIP_PATH)\")
+    CUDA_11=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
-    export HIP_PATH
+    CUDA_12=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
-    HIPCC := $(HIP_PATH)bin/hipcc.bin.exe
+    CUDA_11_LIB_DIR := $(CUDA_11)/bin
-else ifeq ($(OS),Linux)
+    CUDA_12_LIB_DIR := $(CUDA_12)/bin

+    NVCC := $(shell X=$$(which nvcc 2>/dev/null) && cygpath -m -s "$$X")
+    ifneq ($(HIP_PATH),)
+        HIP_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)\bin")
+        # If HIP_PATH has spaces, hipcc trips over them when subprocessing
+        HIP_PATH := $(shell cygpath -m -s "$(HIP_PATH)\")
+        export HIP_PATH
+        HIPCC := $(HIP_PATH)bin/hipcc.bin.exe
+    endif
+    CP := cp
+    CUDA_LIBS = $(wildcard $(addsuffix 64*.$(SHARED_EXT),$(addprefix $(CUDA_LIB_DIR)/$(SHARED_PREFIX),$(CUDA_LIBS_SHORT))))
+else ifeq ($(OS),linux)
+    CP := cp -a
     OBJ_EXT := o
     SHARED_EXT := so
     SHARED_PREFIX := lib
-    CUDA_PATH?=/usr/local/cuda
     HIP_PATH?=/opt/rocm
-    CUDA_LIB := "$(CUDA_PATH)/lib64"
+    HIP_LIB_DIR := $(HIP_PATH)/lib
-    HIP_LIB := "$(HIP_PATH)/lib"
+    HIPCC := $(shell X=$$(which hipcc 2>/dev/null) && echo $$X)
-    NVCC := nvcc
+    CUDA_PATH?=/usr/local/cuda
-    HIPCC := hipcc
+    CUDA_11=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
+    CUDA_12=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
+    CUDA_11_LIB_DIR := $(CUDA_11)/lib64
+    CUDA_12_LIB_DIR := $(CUDA_12)/lib64
 else
     OBJ_EXT := o
     SHARED_EXT := so
+    CP := cp -a
+endif

+CUDA_11_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(CUDA_11_LIB_DIR)/$(SHARED_PREFIX),$(CUDA_LIBS_SHORT))))
+CUDA_12_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(CUDA_12_LIB_DIR)/$(SHARED_PREFIX),$(CUDA_LIBS_SHORT))))
+NVCC_11 = $(CUDA_11)/bin/nvcc
+NVCC_12 = $(CUDA_12)/bin/nvcc

+CUDA_DEPS_DIR = $(DIST_BASE)cuda/
+ROCM_DEPS_DIR = $(DIST_BASE)rocm/

+ifneq ($(CUDA_11),)
+    CUDA_11_VARIANT= _v11
+    CUDA_11_LIB_DEPS = $(addprefix $(CUDA_DEPS_DIR),$(notdir $(CUDA_11_LIBS)))
+endif
+ifneq ($(CUDA_12),)
+    CUDA_12_VARIANT= _v12
+    CUDA_12_LIB_DEPS = $(addprefix $(CUDA_DEPS_DIR),$(notdir $(CUDA_12_LIBS)))
+endif
+ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
+ifneq ($(HIPCC),)
+    ROCM_VERSION := $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(HIPCC) --version)))))))
+    ifneq (,$(ROCM_VERSION))
+        ROCM_VARIANT = _v$(ROCM_VERSION)
+    endif
+    ROCM_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT),$(addprefix $(HIP_LIB_DIR)/$(SHARED_PREFIX),$(ROCM_LIBS_SHORT))))
+    ROCM_LIB_DEPS = $(addprefix $(ROCM_DEPS_DIR),$(notdir $(ROCM_LIBS)))
+    ROCBLAS_DEP_MANIFEST = $(ROCM_DEPS_DIR)/rocblas/library/TensileManifest.txt
+endif
 endif

 CUDA_SRCS := \
@@ -51,21 +112,19 @@ CUDA_SRCS := \
     $(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu) \
     ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp

-CUDA_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
+CUDA_11_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
-CUDA_OBJS := $(CUDA_OBJS:.c=.cuda.$(OBJ_EXT))
+CUDA_11_OBJS := $(CUDA_11_OBJS:.c=.cuda.$(OBJ_EXT))
-CUDA_OBJS := $(addprefix $(BUILD_DIR),$(CUDA_OBJS:.cpp=.cuda.$(OBJ_EXT)))
+CUDA_11_OBJS := $(addprefix $(BUILD_DIR)/cuda_v11/,$(CUDA_11_OBJS:.cpp=.cuda.$(OBJ_EXT)))
+CUDA_12_OBJS := $(CUDA_SRCS:.cu=.cuda.$(OBJ_EXT))
+CUDA_12_OBJS := $(CUDA_12_OBJS:.c=.cuda.$(OBJ_EXT))
+CUDA_12_OBJS := $(addprefix $(BUILD_DIR)/cuda_v12/,$(CUDA_12_OBJS:.cpp=.cuda.$(OBJ_EXT)))

 HIP_OBJS := $(CUDA_SRCS:.cu=.hip.$(OBJ_EXT))
 HIP_OBJS := $(HIP_OBJS:.c=.hip.$(OBJ_EXT))
-HIP_OBJS := $(addprefix $(BUILD_DIR),$(HIP_OBJS:.cpp=.hip.$(OBJ_EXT)))
+HIP_OBJS := $(addprefix $(BUILD_DIR)/,$(HIP_OBJS:.cpp=.hip.$(OBJ_EXT)))

 CUDA_FLAGS := \
-    --generate-code=arch=compute_50,code=[compute_50,sm_50] \
+    -t4 \
-    --generate-code=arch=compute_52,code=[compute_52,sm_52] \
-    --generate-code=arch=compute_61,code=[compute_61,sm_61] \
-    --generate-code=arch=compute_70,code=[compute_70,sm_70] \
-    --generate-code=arch=compute_75,code=[compute_75,sm_75] \
-    --generate-code=arch=compute_80,code=[compute_80,sm_80] \
     -DGGML_CUDA_DMMV_X=32 \
     -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
     -DGGML_USE_CUDA=1 \
@@ -82,6 +141,34 @@ CUDA_FLAGS := \
     -I. \
     -O3

+CUDA_11_FLAGS := \
+    --generate-code=arch=compute_50,code=[compute_50,sm_50] \
+    --generate-code=arch=compute_52,code=[compute_52,sm_52] \
+    --generate-code=arch=compute_53,code=[compute_53,sm_53] \
+    --generate-code=arch=compute_60,code=[compute_60,sm_60] \
+    --generate-code=arch=compute_61,code=[compute_61,sm_61] \
+    --generate-code=arch=compute_62,code=[compute_62,sm_62] \
+    --generate-code=arch=compute_70,code=[compute_70,sm_70] \
+    --generate-code=arch=compute_72,code=[compute_72,sm_72] \
+    --generate-code=arch=compute_75,code=[compute_75,sm_75] \
+    --generate-code=arch=compute_80,code=[compute_80,sm_80] \
+    --generate-code=arch=compute_86,code=[compute_86,sm_86]

+CUDA_12_FLAGS := \
+    --generate-code=arch=compute_60,code=[compute_60,sm_60] \
+    --generate-code=arch=compute_61,code=[compute_61,sm_61] \
+    --generate-code=arch=compute_62,code=[compute_62,sm_62] \
+    --generate-code=arch=compute_70,code=[compute_70,sm_70] \
+    --generate-code=arch=compute_72,code=[compute_72,sm_72] \
+    --generate-code=arch=compute_75,code=[compute_75,sm_75] \
+    --generate-code=arch=compute_80,code=[compute_80,sm_80] \
+    --generate-code=arch=compute_86,code=[compute_86,sm_86] \
+    --generate-code=arch=compute_87,code=[compute_87,sm_87] \
+    --generate-code=arch=compute_89,code=[compute_89,sm_89] \
+    --generate-code=arch=compute_90,code=[compute_90,sm_90] \
+    --generate-code=arch=compute_90a,code=[compute_90a,sm_90a] \
+    -DGGML_CUDA_USE_GRAPHS=on

 HIP_ARCHS := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
 LINUX_HIP_ARCHS := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-

@@ -111,79 +198,181 @@ HIP_FLAGS := \
     -I. \
     $(foreach arch, $(HIP_ARCHS), --offload-arch=$(arch))

-ifeq ($(OS), Linux)
+ifeq ($(OS),linux)
-    HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch)) -fPIC
+    HIP_FLAGS += $(foreach arch, $(LINUX_HIP_ARCHS), --offload-arch=$(arch)) -fPIC -Wno-unused-function
-    CUDA_FLAGS += -fPIC
+    CUDA_FLAGS += -fPIC -Wno-unused-function
-    CFLAGS += -Xcompiler -fPIC -D_GNU_SOURCE
+    NVCC_CFLAGS = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
-    CXXFLAGS += -Xcompiler -fPIC -D_GNU_SOURCE
+    NVCC_CXXFLAGS = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
-else ifeq ($(OS),Windows)
+    HIPCC_CFLAGS = $(CFLAGS) -fPIC -D_GNU_SOURCE
+    HIPCC_CXXFLAGS = $(CXXFLAGS) -fPIC -D_GNU_SOURCE
+else ifeq ($(OS),windows)
     HIP_FLAGS += -Xclang --dependent-lib=msvcrt
+    CFLAGS += -D_WIN32_WINNT=0x602
+    CXXFLAGS += -D_WIN32_WINNT=0x602
+    NVCC_CFLAGS = $(CFLAGS)
+    NVCC_CXXFLAGS = $(CXXFLAGS)
+    HIPCC_CFLAGS = $(CFLAGS)
+    HIPCC_CXXFLAGS = $(CXXFLAGS)
 endif

-RUNNERS := $(BUILD_DIR)ollama_runner$(EXE_EXT)
+ifeq ($(OLLAMA_SKIP_CPU_GENERATE),)
+RUNNERS := $(DEFAULT_RUNNER)
 ifeq ($(ARCH),amd64)
-    RUNNERS += $(BUILD_DIR)ollama_runner_avx$(EXE_EXT) $(BUILD_DIR)ollama_runner_avx2$(EXE_EXT)
+    RUNNERS += cpu_avx cpu_avx2
 endif
-ifneq ($(NVCC),)
-    RUNNERS += $(BUILD_DIR)ollama_runner_cuda$(EXE_EXT)
 endif
+ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
+ifneq ($(CUDA_11),)
+    RUNNERS += cuda_v11
+endif
+ifneq ($(CUDA_12),)
+    RUNNERS += cuda_v12
+endif
+endif
+ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
 ifneq ($(HIPCC),)
-    RUNNERS += $(BUILD_DIR)ollama_runner_rocm$(EXE_EXT)
+    RUNNERS += rocm$(ROCM_VARIANT)
+endif
 endif

-runners: $(RUNNERS)
+DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_runner$(EXE_EXT),$(RUNNERS)))
+PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_runner$(EXE_EXT).gz,$(addsuffix /bin,$(RUNNERS))))
+BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_runner$(EXE_EXT),$(RUNNERS)))

-$(BUILD_DIR)%.cuda.$(OBJ_EXT): %.cu
+all: dist payload

+dist: $(DIST_RUNNERS) $(ROCBLAS_DEP_MANIFEST)

+ifeq ($(OS),windows)
+# Unused on windows as we don't cary the payloads in the go binary
+payload:
+else
+payload: $(PAYLOAD_RUNNERS)
+endif

+runners: $(BUILD_RUNNERS)

+$(BUILD_DIR)/cuda_v11/%.cuda.$(OBJ_EXT): %.cu
     @-mkdir -p $(dir $@)
-    $(NVCC) -c $(CUDA_FLAGS) -o $@ $<
+    $(NVCC_11) -c $(CUDA_FLAGS) $(CUDA_11_FLAGS) -o $@ $<

-$(BUILD_DIR)%.cuda.$(OBJ_EXT): %.c
+$(BUILD_DIR)/cuda_v11/%.cuda.$(OBJ_EXT): %.c
     @-mkdir -p $(dir $@)
-    $(NVCC) -c $(CFLAGS) -o $@ $<
+    $(NVCC_11) -c $(NVCC_CFLAGS) -o $@ $<

-$(BUILD_DIR)%.cuda.$(OBJ_EXT): %.cpp
+$(BUILD_DIR)/cuda_v11/%.cuda.$(OBJ_EXT): %.cpp
     @-mkdir -p $(dir $@)
-    $(NVCC) -c $(CXXFLAGS) -o $@ $<
+    $(NVCC_11) -c $(NVCC_CXXFLAGS) -o $@ $<

-$(BUILD_DIR)$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT): $(CUDA_OBJS)
+$(BUILD_DIR)/cuda_v12/%.cuda.$(OBJ_EXT): %.cu
     @-mkdir -p $(dir $@)
-    $(NVCC) --shared -lcuda -lcublas -lcudart -lcublasLt $(CUDA_FLAGS) $(CUDA_OBJS) -o $@
+    $(NVCC_12) -c $(CUDA_FLAGS) $(CUDA_12_FLAGS) -o $@ $<

-$(BUILD_DIR)%.hip.$(OBJ_EXT): %.cu
+$(BUILD_DIR)/cuda_v12/%.cuda.$(OBJ_EXT): %.c
+    @-mkdir -p $(dir $@)
+    $(NVCC_12) -c $(NVCC_CFLAGS) -o $@ $<

+$(BUILD_DIR)/cuda_v12/%.cuda.$(OBJ_EXT): %.cpp
+    @-mkdir -p $(dir $@)
+    $(NVCC_12) -c $(NVCC_CXXFLAGS) -o $@ $<

+$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
+    @-mkdir -p $(dir $@)
+    cp $< $@

+$(RUNNERS_DIST_DIR)/cuda_v11/ollama_runner$(EXE_EXT): $(RUNNERS_DIST_DIR)/cuda_v11/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT)
+$(RUNNERS_PAYLOAD_DIR)/cuda_v11/bin/ollama_runner$(EXE_EXT).gz: $(RUNNERS_PAYLOAD_DIR)/cuda_v11/bin/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT).gz
+$(RUNNERS_DIST_DIR)/cuda_v12/ollama_runner$(EXE_EXT): $(RUNNERS_DIST_DIR)/cuda_v12/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT)
+$(RUNNERS_PAYLOAD_DIR)/cuda_v12/bin/ollama_runner$(EXE_EXT).gz: $(RUNNERS_PAYLOAD_DIR)/cuda_v12/bin/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT).gz

+$(RUNNERS_BUILD_DIR)/cuda_v11/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT): $(CUDA_11_OBJS) $(CUDA_11_LIB_DEPS)
+    @-mkdir -p $(dir $@)
+    $(NVCC_11) --shared -lcuda -L${CUDA_DEPS_DIR} $(foreach lib, $(CUDA_LIBS_SHORT), -l$(lib)) $(CUDA_FLAGS) $(CUDA_11_FLAGS) $(CUDA_11_OBJS) -o $@

+$(RUNNERS_BUILD_DIR)/cuda_v12/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT): $(CUDA_12_OBJS) $(CUDA_12_LIB_DEPS)
+    @-mkdir -p $(dir $@)
+    $(NVCC_12) --shared -lcuda -L${CUDA_DEPS_DIR} $(foreach lib, $(CUDA_LIBS_SHORT), -l$(lib)) $(CUDA_FLAGS) $(CUDA_12_FLAGS) $(CUDA_12_OBJS) -o $@

+$(CUDA_11_LIB_DEPS):
+    @-mkdir -p $(dir $@)
+    $(CP) $(CUDA_11_LIB_DIR)/$(notdir $@)* $(dir $@)

+$(CUDA_12_LIB_DEPS):
+    @-mkdir -p $(dir $@)
+    $(CP) $(CUDA_12_LIB_DIR)/$(notdir $@)* $(dir $@)

+$(BUILD_DIR)/%.hip.$(OBJ_EXT): %.cu
     @-mkdir -p $(dir $@)
     $(HIPCC) -c $(HIP_FLAGS) -o $@ $<

-$(BUILD_DIR)%.hip.$(OBJ_EXT): %.c
+$(BUILD_DIR)/%.hip.$(OBJ_EXT): %.c
     @-mkdir -p $(dir $@)
-    $(HIPCC) -c $(CFLAGS) -o $@ $<
+    $(HIPCC) -c $(HIPCC_CFLAGS) -o $@ $<

-$(BUILD_DIR)%.hip.$(OBJ_EXT): %.cpp
+$(BUILD_DIR)/%.hip.$(OBJ_EXT): %.cpp
     @-mkdir -p $(dir $@)
-    $(HIPCC) -c $(CXXFLAGS) -o $@ $<
+    $(HIPCC) -c $(HIPCC_CXXFLAGS) -o $@ $<

-$(BUILD_DIR)$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS)
+$(RUNNERS_DIST_DIR)/rocm$(ROCM_VARIANT)/ollama_runner$(EXE_EXT): $(RUNNERS_DIST_DIR)/rocm$(ROCM_VARIANT)/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT)
+$(RUNNERS_PAYLOAD_DIR)/rocm$(ROCM_VARIANT)/bin/ollama_runner$(EXE_EXT).gz: $(RUNNERS_PAYLOAD_DIR)/rocm$(ROCM_VARIANT)/bin/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT).gz

+$(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT): $(HIP_OBJS) $(ROCM_LIB_DEPS)
     @-mkdir -p $(dir $@)
-    $(HIPCC) --shared -lhipblas -lamdhip64 -lrocblas $(HIP_OBJS) -o $@
+    $(HIPCC) --shared -lamdhip64 -L${ROCM_DEPS_DIR} $(foreach lib, $(ROCM_LIBS_SHORT), -l$(lib)) $(HIP_OBJS) -o $@

-$(BUILD_DIR)ollama_runner$(EXE_EXT):
+$(ROCM_LIB_DEPS):
+    @-mkdir -p $(dir $@)
+    $(CP) $(HIP_LIB_DIR)/$(notdir $@)* $(dir $@)

+$(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_runner$(EXE_EXT): *.go ./runner/*.go
+    @-mkdir -p $(dir $@)
     CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -o $@ ./runner

-$(BUILD_DIR)ollama_runner_avx$(EXE_EXT):
+$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_runner$(EXE_EXT): *.go ./runner/*.go
+    @-mkdir -p $(dir $@)
     CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx -o $@ ./runner

-$(BUILD_DIR)ollama_runner_avx2$(EXE_EXT):
+$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_runner$(EXE_EXT): *.go ./runner/*.go
+    @-mkdir -p $(dir $@)
     CGO_ENABLED=1 GOARCH=$(ARCH) go build -ldflags "-s -w" -tags avx,avx2 -o $@ ./runner

-$(BUILD_DIR)ollama_runner_cuda$(EXE_EXT): $(BUILD_DIR)$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT)
+$(RUNNERS_BUILD_DIR)/cuda_v11/ollama_runner$(EXE_EXT): $(RUNNERS_BUILD_DIR)/cuda_v11/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT) *.go ./runner/*.go
-    CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(CUDA_LIB)" go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner
+    @-mkdir -p $(dir $@)
+    CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(RUNNERS_BUILD_DIR)/cuda_v11/" go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner

-$(BUILD_DIR)ollama_runner_rocm$(EXE_EXT): $(BUILD_DIR)$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT)
+$(RUNNERS_BUILD_DIR)/cuda_v12/ollama_runner$(EXE_EXT): $(RUNNERS_BUILD_DIR)/cuda_v12/$(SHARED_PREFIX)ggml_cuda.$(SHARED_EXT) *.go ./runner/*.go
-    CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(HIP_LIB)" go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner
+    @-mkdir -p $(dir $@)
+    CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(RUNNERS_BUILD_DIR)/cuda_v12/" go build -ldflags "-s -w" -tags avx,cuda -o $@ ./runner

+$(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/ollama_runner$(EXE_EXT): $(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/$(SHARED_PREFIX)ggml_hipblas.$(SHARED_EXT) *.go ./runner/*.go
+    @-mkdir -p $(dir $@)
+    CGO_ENABLED=1 GOARCH=$(ARCH) CGO_LDFLAGS=-L"$(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/" go build -ldflags "-s -w" -tags avx,rocm -o $@ ./runner

+$(ROCBLAS_DEP_MANIFEST):
+    @-mkdir -p $(dir $@)
+    @echo "Copying rocblas library..."
+    cd $(HIP_LIB_DIR)/rocblas/library/ && tar cf - . | (cd $(dir $@) && tar xf - )
+    @echo "rocblas library copy complete"

+$(RUNNERS_PAYLOAD_DIR)/%/bin/ollama_runner.gz: $(RUNNERS_BUILD_DIR)/%/ollama_runner
+    @-mkdir -p $(dir $@)
+    gzip --best -c $< > $@
+$(RUNNERS_PAYLOAD_DIR)/cuda_v11/bin/%.gz: $(RUNNERS_BUILD_DIR)/cuda_v11/%
+    @-mkdir -p $(dir $@)
+    gzip --best -c $< > $@
+$(RUNNERS_PAYLOAD_DIR)/cuda_v12/bin/%.gz: $(RUNNERS_BUILD_DIR)/cuda_v12/%
+    @-mkdir -p $(dir $@)
+    gzip --best -c $< > $@
+$(RUNNERS_PAYLOAD_DIR)/rocm$(ROCM_VARIANT)/bin/%.gz: $(RUNNERS_BUILD_DIR)/rocm$(ROCM_VARIANT)/%
+    @-mkdir -p $(dir $@)
+    gzip --best -c $< > $@

 clean:
-    rm -rf $(BUILD_DIR)
+    rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)

-.PHONY: runners clean ollama_runner$(EXE_EXT) ollama_runner_avx$(EXE_EXT) ollama_runner_avx2$(EXE_EXT) ollama_runner_cuda$(EXE_EXT) ollama_runner_rocm$(EXE_EXT)
+.PHONY: all dist payload runners clean $(RUNNERS)

 # Handy debugging for make variables
 print-%:
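The new Makefile lays runners out as one subdirectory per variant (for example cpu, cpu_avx, cpu_avx2, cuda_v11, cuda_v12, rocm_vN, or metal on darwin/arm64), each holding an ollama_runner binary plus the GPU libraries it needs. As a hedged illustration of that layout, not code from this commit, a caller could enumerate the variants that were built like this:

package main

import (
    "fmt"
    "os"
    "path/filepath"
    "runtime"
)

func main() {
    // OLLAMA_RUNNERS_DIR points at the directory the dist target populates,
    // e.g. dist/linux-amd64/ollama_runners; the fallback path is illustrative.
    dir := os.Getenv("OLLAMA_RUNNERS_DIR")
    if dir == "" {
        dir = "dist/linux-amd64/ollama_runners"
    }

    exe := "ollama_runner"
    if runtime.GOOS == "windows" {
        exe += ".exe"
    }

    // Each immediate subdirectory that contains the runner binary is a variant.
    matches, err := filepath.Glob(filepath.Join(dir, "*", exe))
    if err != nil {
        panic(err)
    }
    for _, m := range matches {
        fmt.Println("runner variant:", filepath.Base(filepath.Dir(m)))
    }
}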
@@ -10,6 +10,8 @@ import (
     "math"
     "net"
     "net/http"
+    "os"
+    "path/filepath"
     "runtime"
     "strconv"
     "strings"
@@ -146,7 +148,7 @@ func (s *Server) run(ctx context.Context) {
         case <-ctx.Done():
             return
         default:
-            slog.Info("Processing batch", "seqs", len(s.seqs))
+            slog.Debug("Processing batch", "seqs", len(s.seqs))
             s.mu.Lock()
             for s.allNil() {
                 s.cond.Wait() // Wait until an item is added
@@ -186,6 +188,7 @@ func (s *Server) run(ctx context.Context) {

             err := s.lc.Decode(batch)
             if err != nil {
+                slog.Error("failed to decode batch", "error", err)
                 panic("Failed to decode")
             }

@@ -227,7 +230,7 @@ func (s *Server) run(ctx context.Context) {

                 seq.numPredicted++

-                slog.Info("sampled", "piece", piece)
+                slog.Debug("sampled", "piece", piece)

                 // if it's an end of sequence token, break
                 // TODO: just end this sequence
@@ -463,7 +466,6 @@ type HealthResponse struct {
 // TODO (jmorganca): is it safe to do this concurrently with decoding?
 func (s *Server) health(w http.ResponseWriter, r *http.Request) {
     w.Header().Set("Content-Type", "application/json")
-
     if err := json.NewEncoder(w).Encode(&HealthResponse{
         Status:   s.status,
         Progress: s.progress,
@@ -475,17 +477,63 @@ func (s *Server) health(w http.ResponseWriter, r *http.Request) {

 func main() {
     mpath := flag.String("model", "", "Path to model binary file")
-    ppath := flag.String("projector", "", "Path to projector binary file")
+    ppath := flag.String("mmproj", "", "Path to projector binary file")
     parallel := flag.Int("parallel", 1, "Number of sequences to handle simultaneously")
     batchSize := flag.Int("batch-size", 512, "Batch size")
-    nGpuLayers := flag.Int("num-gpu", 0, "Number of layers to offload to GPU")
+    nGpuLayers := flag.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
     mainGpu := flag.Int("main-gpu", 0, "Main GPU")
-    flashAttention := flag.Bool("flash-attention", false, "Enable flash attention")
+    flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
-    numCtx := flag.Int("num-ctx", 2048, "Context (or KV cache) size")
+    numCtx := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
     lpath := flag.String("lora", "", "Path to lora layer file")
     port := flag.Int("port", 8080, "Port to expose the server on")
     threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")

+    // TODO not yet implemented but wired to keep the parsing aligned
+    embedding := flag.Bool("embedding", false, "enable embedding vector output (default: disabled)")
+    logDisable := flag.Bool("log-disable", false, "disables logging to a file")
+    verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
+    f32 := flag.Bool("memory-f32", false, "use f32 instead of f16 for memory key+value (default: disabled) not recommended: doubles context memory required and no measurable increase in quality")
+    noMmap := flag.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
+    mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
+    tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")

     flag.Parse()
+    level := slog.LevelInfo
+    if *verbose {
+        level = slog.LevelDebug
+    }
+    handler := slog.NewTextHandler(os.Stderr, &slog.HandlerOptions{
+        Level:     level,
+        AddSource: true,
+        ReplaceAttr: func(_ []string, attr slog.Attr) slog.Attr {
+            if attr.Key == slog.SourceKey {
+                source := attr.Value.Any().(*slog.Source)
+                source.File = filepath.Base(source.File)
+            }
+            return attr
+        },
+    })
+    slog.SetDefault(slog.New(handler))

+    // TODO actually implement...
+    if *embedding {
+        slog.Warn("embeddings not yet support")
+    }
+    if *logDisable {
+        slog.Info("ignoring --log-disable")
+    }
+    if *f32 {
+        slog.Warn("memory-f32 not yet supported")
+    }
+    if *noMmap {
+        slog.Warn("no-mmap not yet supported")
+    }
+    if *mlock {
+        slog.Warn("mlock not yet supported")
+    }
+    if *tensorSplit != "" {
+        slog.Warn("tensor-split not yet implemented")
+    }

     server := &Server{
         numCtx: *numCtx,
@@ -498,7 +546,7 @@ func main() {
     // load the model
     llama.BackendInit()
     params := llama.NewModelParams(*nGpuLayers, *mainGpu, func(progress float32) {
-        slog.Info("Loading model", "progress %", math.Round(float64(progress*100)))
+        slog.Debug("Loading model", "progress %", math.Round(float64(progress*100)))
         server.progress = progress
     })
     server.model = llama.LoadModelFromFile(*mpath, params)
@@ -531,7 +579,7 @@ func main() {
     defer listener.Close()

     mux := http.NewServeMux()
-    mux.HandleFunc("/embeddings", server.embeddings)
+    mux.HandleFunc("/embedding", server.embeddings)
     mux.HandleFunc("/completion", server.completion)
     mux.HandleFunc("/health", server.health)

@@ -539,7 +587,7 @@ func main() {
         Handler: mux,
     }

-    server.status = "ready"
+    server.status = "ok"

     log.Println("Server listening on", addr)
     if err := httpServer.Serve(listener); err != nil {
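The renamed flags above (mmproj, n-gpu-layers, flash-attn, ctx-size, plus the accepted-but-unimplemented embedding, log-disable, memory-f32, no-mmap, mlock and tensor-split) align the runner's command line with conventional llama.cpp server flag names. As a sketch only, assuming a built binary sitting at ./ollama_runner and a placeholder model path, launching it by hand could look like:

package main

import (
    "log"
    "os"
    "os/exec"
)

func main() {
    // Flag names follow the runner's main() as changed in this commit;
    // the binary and model paths are placeholders.
    cmd := exec.Command("./ollama_runner",
        "--model", "/path/to/model.gguf",
        "--ctx-size", "2048",
        "--batch-size", "512",
        "--n-gpu-layers", "0",
        "--port", "8080",
        "--verbose",
    )
    cmd.Stdout = os.Stdout
    cmd.Stderr = os.Stderr
    if err := cmd.Run(); err != nil {
        log.Fatal(err)
    }
}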
llm/llm.go (66)
@@ -12,6 +12,7 @@ package llm
 import "C"
 import (
     "fmt"
+    "log/slog"
     "unsafe"
 )

@@ -37,3 +38,68 @@ func Quantize(infile, outfile string, ftype fileType) error {

     return nil
 }
+
+type loadedModel struct {
+    model *C.struct_llama_model
+}
+
+func loadModel(modelfile string, vocabOnly bool) (*loadedModel, error) {
+    // TODO figure out how to quiet down the logging so we don't have 2 copies of the model metadata showing up
+    slog.Info("XXX initializing default model params")
+    params := C.llama_model_default_params()
+    params.vocab_only = C.bool(vocabOnly)
+
+    cmodelfile := C.CString(modelfile)
+    defer C.free(unsafe.Pointer(cmodelfile))
+
+    slog.Info("XXX loading model", "model", modelfile)
+    model := C.llama_load_model_from_file(cmodelfile, params)
+    if model == nil {
+        return nil, fmt.Errorf("failed to load model %s", modelfile)
+    }
+    return &loadedModel{model}, nil
+}
+
+func freeModel(model *loadedModel) {
+    C.llama_free_model(model.model)
+}
+
+func tokenize(model *loadedModel, content string) ([]int, error) {
+    ccontent := C.CString(content)
+    defer C.free(unsafe.Pointer(ccontent))
+
+    len := len(content) + 2
+    tokens := make([]C.int32_t, len)
+
+    tokenCount := C.llama_tokenize(model.model, ccontent, C.int32_t(len), &tokens[0], C.int32_t(len), true, true)
+    if tokenCount < 0 {
+        slog.Info("XXX got negative response", "count", tokenCount)
+        tokens = make([]C.int32_t, int(tokenCount))
+        tokenCount = C.llama_tokenize(model.model, ccontent, C.int32_t(len), &tokens[0], tokenCount, true, true)
+    } else if tokenCount == 0 {
+        return nil, nil
+    }
+    ret := make([]int, tokenCount)
+    for i := range int(tokenCount) {
+        ret[i] = int(tokens[i])
+    }
+    slog.Debug("XXX tokenized", "tokens", tokens, "content", content)
+    return ret, nil
+}
+
+func detokenize(model *loadedModel, tokens []int) string {
+    slog.Info("XXX in CGO detokenize")
+    var resp string
+    for _, token := range tokens {
+        buf := make([]C.char, 8)
+        nTokens := C.llama_token_to_piece(model.model, C.int(token), &buf[0], 8, 0, true)
+        if nTokens < 0 {
+            buf = make([]C.char, -nTokens)
+            nTokens = C.llama_token_to_piece(model.model, C.int(token), &buf[0], -nTokens, 0, true)
+        }
+        tokString := C.GoStringN(&buf[0], nTokens)
+        resp += tokString
+    }
+    slog.Debug("XXX detokenized", "tokens", tokens, "content", resp)
+    return resp
+}
llm/server.go (105)
@@ -50,6 +50,7 @@ type llmServer struct {
     status      *StatusWriter
     options     api.Options
     numParallel int
+    model       *loadedModel

     estimate    MemoryEstimate
     totalLayers uint64
@@ -322,6 +323,9 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     }

     server := filepath.Join(dir, "ollama_llama_server")
+    if envconfig.NewRunners() {
+        server = filepath.Join(dir, "ollama_runner")
+    }
     if runtime.GOOS == "windows" {
         server += ".exe"
     }
@@ -329,6 +333,9 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
     // Detect tmp cleaners wiping out the file
     _, err := os.Stat(server)
     if errors.Is(err, os.ErrNotExist) {
+        if envconfig.NewRunners() {
+            return nil, fmt.Errorf("experimental runners enabled, but not present in this build")
+        }
         slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
         err = Init()
         if err != nil {
@@ -337,11 +344,16 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
         }
     }

+    m, err := loadModel(model, true)
+    if err != nil {
+        return nil, fmt.Errorf("unable to load model for tokenization %w", err)
+    }
     s := &llmServer{
         port:        port,
         cmd:         exec.Command(server, finalParams...),
         status:      NewStatusWriter(os.Stderr),
         options:     opts,
+        model:       m,
         estimate:    estimate,
         numParallel: numParallel,
         sem:         semaphore.NewWeighted(int64(numParallel)),
@@ -816,16 +828,16 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
                 continue
             }

+            // slog.Debug("got line", "line", string(line))
             evt, ok := bytes.CutPrefix(line, []byte("data: "))
             if !ok {
-                return fmt.Errorf("error parsing llm response stream: %s", line)
+                evt = line
             }

             var c completion
             if err := json.Unmarshal(evt, &c); err != nil {
                 return fmt.Errorf("error unmarshalling llm prediction response: %v", err)
             }

             switch {
             case strings.TrimSpace(c.Content) == lastToken:
                 tokenRepeat++
@@ -927,7 +939,7 @@ func (s *llmServer) Embedding(ctx context.Context, input string) ([]float32, err
     }

     if resp.StatusCode >= 400 {
-        log.Printf("llm encode error: %s", body)
+        log.Printf("llm embedding error: %s", body)
         return nil, fmt.Errorf("%s", body)
     }

@@ -948,47 +960,7 @@ type TokenizeResponse struct {
 }

 func (s *llmServer) Tokenize(ctx context.Context, content string) ([]int, error) {
-    // Make sure the server is ready
+    return tokenize(s.model, content)
-    status, err := s.getServerStatus(ctx)
-    if err != nil {
-        return nil, err
-    } else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
-        return nil, fmt.Errorf("unexpected server status: %s", status.ToString())
-    }
-
-    data, err := json.Marshal(TokenizeRequest{Content: content})
-    if err != nil {
-        return nil, fmt.Errorf("marshaling encode data: %w", err)
-    }
-
-    req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/tokenize", s.port), bytes.NewBuffer(data))
-    if err != nil {
-        return nil, fmt.Errorf("encode request: %w", err)
-    }
-    req.Header.Set("Content-Type", "application/json")
-
-    resp, err := http.DefaultClient.Do(req)
-    if err != nil {
-        return nil, fmt.Errorf("do encode request: %w", err)
-    }
-    defer resp.Body.Close()
-
-    body, err := io.ReadAll(resp.Body)
-    if err != nil {
-        return nil, fmt.Errorf("read encode request: %w", err)
-    }
-
-    if resp.StatusCode >= 400 {
-        log.Printf("llm encode error: %s", body)
-        return nil, fmt.Errorf("%s", body)
-    }
-
-    var encoded TokenizeResponse
-    if err := json.Unmarshal(body, &encoded); err != nil {
-        return nil, fmt.Errorf("unmarshal encode response: %w", err)
-    }
-
-    return encoded.Tokens, nil
 }

 type DetokenizeRequest struct {
@@ -1000,50 +972,13 @@ type DetokenizeResponse struct {
 }

 func (s *llmServer) Detokenize(ctx context.Context, tokens []int) (string, error) {
-    // Make sure the server is ready
+    return detokenize(s.model, tokens), nil
-    status, err := s.getServerStatus(ctx)
-    if err != nil {
-        return "", err
-    } else if status != ServerStatusReady && status != ServerStatusNoSlotsAvailable {
-        return "", fmt.Errorf("unexpected server status: %s", status.ToString())
-    }
-
-    data, err := json.Marshal(DetokenizeRequest{Tokens: tokens})
-    if err != nil {
-        return "", fmt.Errorf("marshaling decode data: %w", err)
-    }
-
-    req, err := http.NewRequestWithContext(ctx, http.MethodPost, fmt.Sprintf("http://127.0.0.1:%d/detokenize", s.port), bytes.NewBuffer(data))
-    if err != nil {
-        return "", fmt.Errorf("decode request: %w", err)
-    }
-    req.Header.Set("Content-Type", "application/json")
-
-    resp, err := http.DefaultClient.Do(req)
-    if err != nil {
-        return "", fmt.Errorf("do decode request: %w", err)
-    }
-    defer resp.Body.Close()
-
-    body, err := io.ReadAll(resp.Body)
-    if err != nil {
-        return "", fmt.Errorf("read decode request: %w", err)
-    }
-
-    if resp.StatusCode >= 400 {
-        log.Printf("llm decode error: %s", body)
-        return "", fmt.Errorf("%s", body)
-    }
-
-    var decoded DetokenizeResponse
-    if err := json.Unmarshal(body, &decoded); err != nil {
-        return "", fmt.Errorf("unmarshal encode response: %w", err)
-    }
-
-    return decoded.Content, nil
 }

 func (s *llmServer) Close() error {
+    if s.model != nil {
+        freeModel(s.model)
+    }
     if s.cmd != nil {
         slog.Debug("stopping llama server")
         if err := s.cmd.Process.Kill(); err != nil {