Mirror of https://github.com/tcsenpai/ollama.git, synced 2025-06-16 07:47:33 +00:00
Compare commits
No commits in common. "main" and "v0.3.13" have entirely different histories.
@@ -330,7 +330,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface)
 - [ARGO](https://github.com/xark-argo/argo) (Locally download and run Ollama and Huggingface models with RAG on Mac/Windows/Linux)
 - [G1](https://github.com/bklieger-groq/g1) (Prototype of using prompting strategies to improve the LLM's reasoning through o1-like reasoning chains.)
-- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
 
 ### Terminal
 
@@ -417,7 +416,6 @@ See the [API documentation](./docs/api.md) for all endpoints.
 
 - [Enchanted](https://github.com/AugustDev/enchanted)
 - [Maid](https://github.com/Mobile-Artificial-Intelligence/maid)
-- [Ollama App](https://github.com/JHubi1/ollama-app) (Modern and easy-to-use multi-platform client for Ollama)
 - [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption)
 
 ### Extensions & Plugins
@@ -47,11 +47,10 @@ var (
 )
 
 // Gather GPU information from the amdgpu driver if any supported GPUs are detected
-// Only called once during bootstrap
-func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
+func AMDGetGPUInfo() []RocmGPUInfo {
 	resp := []RocmGPUInfo{}
 	if !AMDDetected() {
-		return resp, fmt.Errorf("AMD GPUs not detected")
+		return resp
 	}
 
 	// Opportunistic logging of driver version to aid in troubleshooting
@@ -195,9 +194,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 
 		// Shouldn't happen, but just in case...
 		if gpuID < 0 {
-			err := fmt.Errorf("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
-			slog.Error(err.Error())
-			return nil, err
+			slog.Error("unexpected amdgpu sysfs data resulted in negative GPU ID, please set OLLAMA_DEBUG=1 and report an issue")
+			return nil
+		}
+
+		if int(major) < RocmComputeMin {
+			slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID)
+			continue
 		}
 
 		// Look up the memory for the current node
@@ -267,12 +270,19 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			break
 		}
 
+		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
+		if totalMemory < IGPUMemLimit {
+			slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory))
+			continue
+		}
 		var name string
 		// TODO - PCI ID lookup
 		if vendor > 0 && device > 0 {
 			name = fmt.Sprintf("%04x:%04x", vendor, device)
 		}
 
+		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
+		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
 		gpuInfo := RocmGPUInfo{
 			GpuInfo: GpuInfo{
 				Library: "rocm",
@@ -290,31 +300,6 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			usedFilepath: usedFile,
 		}
 
-		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
-		if totalMemory < IGPUMemLimit {
-			reason := "unsupported Radeon iGPU detected skipping"
-			slog.Info(reason, "id", gpuID, "total", format.HumanBytes2(totalMemory))
-			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-				GpuInfo: gpuInfo.GpuInfo,
-				Reason:  reason,
-			})
-			continue
-		}
-
-		if int(major) < RocmComputeMin {
-			reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch)
-			slog.Warn(reason, "gpu", gpuID)
-			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-				GpuInfo: gpuInfo.GpuInfo,
-				Reason:  reason,
-			})
-
-			continue
-		}
-
-		slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
-		slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
-
 		// If the user wants to filter to a subset of devices, filter out if we aren't a match
 		if len(visibleDevices) > 0 {
 			include := false
@@ -325,13 +310,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				}
 			}
 			if !include {
-				reason := "filtering out device per user request"
-				slog.Info(reason, "id", gpuInfo.ID, "visible_devices", visibleDevices)
-				unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-					GpuInfo: gpuInfo.GpuInfo,
-					Reason:  reason,
-				})
-
+				slog.Info("filtering out device per user request", "id", gpuInfo.ID, "visible_devices", visibleDevices)
 				continue
 			}
 		}
@@ -341,13 +320,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	if libDir == "" {
 		libDir, err = AMDValidateLibDir()
 		if err != nil {
-			err = fmt.Errorf("unable to verify rocm library: %w", err)
-			slog.Warn(err.Error())
-			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-				GpuInfo: gpuInfo.GpuInfo,
-				Reason:  err.Error(),
-			})
-			return nil, err
+			slog.Warn("unable to verify rocm library, will use cpu", "error", err)
+			return nil
 		}
 	}
 	gpuInfo.DependencyPath = libDir
@@ -357,25 +331,14 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	if len(supported) == 0 {
 		supported, err = GetSupportedGFX(libDir)
 		if err != nil {
-			err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
-			slog.Warn(err.Error())
-			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-				GpuInfo: gpuInfo.GpuInfo,
-				Reason:  err.Error(),
-			})
-			return nil, err
+			slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
+			return nil
 		}
 		slog.Debug("rocm supported GPUs", "types", supported)
 	}
 	gfx := gpuInfo.Compute
 	if !slices.Contains[[]string, string](supported, gfx) {
-		reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
-		slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
-		unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-			GpuInfo: gpuInfo.GpuInfo,
-			Reason:  reason,
-		})
-
+		slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
 		// TODO - consider discrete markdown just for ROCM troubleshooting?
 		slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/gpu.md#overrides for HSA_OVERRIDE_GFX_VERSION usage")
 		continue
@@ -395,16 +358,13 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		resp = append(resp, gpuInfo)
 	}
 	if len(resp) == 0 {
-		err := fmt.Errorf("no compatible amdgpu devices detected")
-		slog.Info(err.Error())
-		return nil, err
+		slog.Info("no compatible amdgpu devices detected")
 	}
 	if err := verifyKFDDriverAccess(); err != nil {
-		err = fmt.Errorf("amdgpu devices detected but permission problems block access: %w", err)
-		slog.Error(err.Error())
-		return nil, err
+		slog.Error("amdgpu devices detected but permission problems block access", "error", err)
+		return nil
 	}
-	return resp, nil
+	return resp
 }
 
 // Quick check for AMD driver so we can skip amdgpu discovery if not present
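The "-" side of these Linux hunks turns logging-only failure paths into returned errors and records skipped devices in an unsupported-GPU list; the "+" side only logs and returns. The following is a minimal, self-contained sketch of that recording pattern, using stand-in types and a hypothetical detect() helper rather than the real ollama code:

```go
package main

import (
	"fmt"
	"log/slog"
)

// Stand-ins for illustration only; the real types live in the gpu package.
type GPUInfo struct{ ID string }

type UnsupportedGPUInfo struct {
	GpuInfo GPUInfo
	Reason  string
}

var unsupportedGPUs []UnsupportedGPUInfo

// detect skips devices below a minimum compute level, remembering why, and
// returns an error when nothing usable remains (the "-" side's behavior).
func detect(compute map[string]int, minCompute int) ([]GPUInfo, error) {
	supported := []GPUInfo{}
	for id, c := range compute {
		if c < minCompute {
			reason := fmt.Sprintf("gpu too old (compute %d < %d)", c, minCompute)
			slog.Warn(reason, "gpu", id)
			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{GpuInfo: GPUInfo{ID: id}, Reason: reason})
			continue
		}
		supported = append(supported, GPUInfo{ID: id})
	}
	if len(supported) == 0 {
		return nil, fmt.Errorf("no compatible devices detected")
	}
	return supported, nil
}

func main() {
	gpus, err := detect(map[string]int{"0": 8, "1": 10}, 9)
	fmt.Println(gpus, err, unsupportedGPUs)
}
```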
@@ -3,7 +3,6 @@ package gpu
 import (
 	"bytes"
 	"errors"
-	"fmt"
 	"log/slog"
 	"os"
 	"path/filepath"
@@ -27,13 +26,12 @@ var (
 	RocmStandardLocations = []string{"C:\\Program Files\\AMD\\ROCm\\6.1\\bin"} // TODO glob?
 )
 
-// Only called once during bootstrap
-func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
+func AMDGetGPUInfo() []RocmGPUInfo {
 	resp := []RocmGPUInfo{}
 	hl, err := NewHipLib()
 	if err != nil {
 		slog.Debug(err.Error())
-		return nil, err
+		return nil
 	}
 	defer hl.Release()
 
@@ -46,15 +44,12 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
 	count := hl.HipGetDeviceCount()
 	if count == 0 {
-		err := fmt.Errorf("no compatible amdgpu devices detected")
-		slog.Info(err.Error())
-		return nil, err
+		return nil
 	}
 	libDir, err := AMDValidateLibDir()
 	if err != nil {
-		err = fmt.Errorf("unable to verify rocm library: %w", err)
-		slog.Warn(err.Error())
-		return nil, err
+		slog.Warn("unable to verify rocm library, will use cpu", "error", err)
+		return nil
 	}
 
 	var supported []string
@@ -62,9 +57,8 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 	if gfxOverride == "" {
 		supported, err = GetSupportedGFX(libDir)
 		if err != nil {
-			err = fmt.Errorf("failed to lookup supported GFX types: %w", err)
-			slog.Warn(err.Error())
-			return nil, err
+			slog.Warn("failed to lookup supported GFX types, falling back to CPU mode", "error", err)
+			return nil
 		}
 	} else {
 		slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
@@ -93,6 +87,21 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
 		// slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
 		// TODO Why isn't props.iGPU accurate!?
+		if strings.EqualFold(name, iGPUName) {
+			slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
+			continue
+		}
+		if gfxOverride == "" {
+			// Strip off Target Features when comparing
+			if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
+				slog.Warn("amdgpu is not supported", "gpu", i, "gpu_type", gfx, "library", libDir, "supported_types", supported)
+				// TODO - consider discrete markdown just for ROCM troubleshooting?
+				slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
+				continue
+			} else {
+				slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
+			}
+		}
 
 		freeMemory, totalMemory, err := hl.HipMemGetInfo()
 		if err != nil {
@@ -100,6 +109,14 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			continue
 		}
 
+		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
+		if totalMemory < IGPUMemLimit {
+			slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", i, "total", format.HumanBytes2(totalMemory))
+			continue
+		}
+
+		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
+		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
 		gpuInfo := RocmGPUInfo{
 			GpuInfo: GpuInfo{
 				Library: "rocm",
@@ -121,38 +138,10 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			index: i,
 		}
 
-		// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
-		if strings.EqualFold(name, iGPUName) || totalMemory < IGPUMemLimit {
-			reason := "unsupported Radeon iGPU detected skipping"
-			slog.Info(reason, "id", gpuInfo.ID, "total", format.HumanBytes2(totalMemory))
-			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-				GpuInfo: gpuInfo.GpuInfo,
-				Reason:  reason,
-			})
-			continue
-		}
-
-		// Strip off Target Features when comparing
-		if !slices.Contains[[]string, string](supported, strings.Split(gfx, ":")[0]) {
-			reason := fmt.Sprintf("amdgpu is not supported (supported types:%s)", supported)
-			slog.Warn(reason, "gpu_type", gfx, "gpu", gpuInfo.ID, "library", libDir)
-			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
-				GpuInfo: gpuInfo.GpuInfo,
-				Reason:  reason,
-			})
-			// HSA_OVERRIDE_GFX_VERSION not supported on windows
-			continue
-		} else {
-			slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
-		}
-
-		slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
-		slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
-
 		resp = append(resp, gpuInfo)
 	}
 
-	return resp, nil
+	return resp
 }
 
 func AMDValidateLibDir() (string, error) {
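Both sides of the Windows hunks compare a device's gfx name against the ROCm library's supported list after stripping target features (the ":" suffixes). A small, self-contained sketch of that comparison, using only standard-library calls and made-up sample values:

```go
package main

import (
	"fmt"
	"slices"
	"strings"
)

// isSupported drops target features such as ":sramecc+:xnack-" before
// checking the base gfx name against the supported list, mirroring the
// strings.Split(gfx, ":")[0] comparison in the diff. Values are illustrative.
func isSupported(gfx string, supported []string) bool {
	base := strings.Split(gfx, ":")[0]
	return slices.Contains(supported, base)
}

func main() {
	supported := []string{"gfx1030", "gfx1100"}
	fmt.Println(isSupported("gfx1030:sramecc+:xnack-", supported)) // true
	fmt.Println(isSupported("gfx90c", supported))                  // false
}
```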
gpu/gpu.go (151 changed lines)
@@ -54,13 +54,6 @@ var (
 	nvmlLibPath string
 	rocmGPUs    []RocmGPUInfo
 	oneapiGPUs  []OneapiGPUInfo
-
-	// If any discovered GPUs are incompatible, report why
-	unsupportedGPUs []UnsupportedGPUInfo
-
-	// Keep track of errors during bootstrapping so that if GPUs are missing
-	// they expected to be present this may explain why
-	bootstrapErrors []error
 )
 
 // With our current CUDA compile flags, older than 5.0 will not work properly
@@ -77,17 +70,16 @@ func initCudaHandles() *cudaHandles {
 
 	cHandles := &cudaHandles{}
 	// Short Circuit if we already know which library to use
-	// ignore bootstrap errors in this case since we already recorded them
 	if nvmlLibPath != "" {
-		cHandles.nvml, _, _ = loadNVMLMgmt([]string{nvmlLibPath})
+		cHandles.nvml, _ = LoadNVMLMgmt([]string{nvmlLibPath})
 		return cHandles
 	}
 	if nvcudaLibPath != "" {
-		cHandles.deviceCount, cHandles.nvcuda, _, _ = loadNVCUDAMgmt([]string{nvcudaLibPath})
+		cHandles.deviceCount, cHandles.nvcuda, _ = LoadNVCUDAMgmt([]string{nvcudaLibPath})
 		return cHandles
 	}
 	if cudartLibPath != "" {
-		cHandles.deviceCount, cHandles.cudart, _, _ = loadCUDARTMgmt([]string{cudartLibPath})
+		cHandles.deviceCount, cHandles.cudart, _ = LoadCUDARTMgmt([]string{cudartLibPath})
 		return cHandles
 	}
 
@@ -110,21 +102,18 @@ func initCudaHandles() *cudaHandles {
 	if len(NvmlGlobs) > 0 {
 		nvmlLibPaths := FindGPULibs(NvmlMgmtName, NvmlGlobs)
 		if len(nvmlLibPaths) > 0 {
-			nvml, libPath, err := loadNVMLMgmt(nvmlLibPaths)
+			nvml, libPath := LoadNVMLMgmt(nvmlLibPaths)
 			if nvml != nil {
 				slog.Debug("nvidia-ml loaded", "library", libPath)
 				cHandles.nvml = nvml
 				nvmlLibPath = libPath
 			}
-			if err != nil {
-				bootstrapErrors = append(bootstrapErrors, err)
-			}
 		}
 	}
 
 	nvcudaLibPaths := FindGPULibs(NvcudaMgmtName, nvcudaMgmtPatterns)
 	if len(nvcudaLibPaths) > 0 {
-		deviceCount, nvcuda, libPath, err := loadNVCUDAMgmt(nvcudaLibPaths)
+		deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
 		if nvcuda != nil {
 			slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
 			cHandles.nvcuda = nvcuda
@@ -132,14 +121,11 @@ func initCudaHandles() *cudaHandles {
 			nvcudaLibPath = libPath
 			return cHandles
 		}
-		if err != nil {
-			bootstrapErrors = append(bootstrapErrors, err)
-		}
 	}
 
 	cudartLibPaths := FindGPULibs(CudartMgmtName, cudartMgmtPatterns)
 	if len(cudartLibPaths) > 0 {
-		deviceCount, cudart, libPath, err := loadCUDARTMgmt(cudartLibPaths)
+		deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
 		if cudart != nil {
 			slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
 			cHandles.cudart = cudart
@@ -147,9 +133,6 @@ func initCudaHandles() *cudaHandles {
 			cudartLibPath = libPath
 			return cHandles
 		}
-		if err != nil {
-			bootstrapErrors = append(bootstrapErrors, err)
-		}
 	}
 
 	return cHandles
@@ -160,19 +143,14 @@ func initOneAPIHandles() *oneapiHandles {
 	oHandles := &oneapiHandles{}
 
 	// Short Circuit if we already know which library to use
-	// ignore bootstrap errors in this case since we already recorded them
 	if oneapiLibPath != "" {
-		oHandles.deviceCount, oHandles.oneapi, _, _ = loadOneapiMgmt([]string{oneapiLibPath})
+		oHandles.deviceCount, oHandles.oneapi, _ = LoadOneapiMgmt([]string{oneapiLibPath})
 		return oHandles
 	}
 
 	oneapiLibPaths := FindGPULibs(OneapiMgmtName, OneapiGlobs)
 	if len(oneapiLibPaths) > 0 {
-		var err error
-		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath, err = loadOneapiMgmt(oneapiLibPaths)
-		if err != nil {
-			bootstrapErrors = append(bootstrapErrors, err)
-		}
+		oHandles.deviceCount, oHandles.oneapi, oneapiLibPath = LoadOneapiMgmt(oneapiLibPaths)
 	}
 
 	return oHandles
@@ -219,7 +197,6 @@ func GetGPUInfo() GpuInfoList {
 
 	if !bootstrapped {
 		slog.Info("looking for compatible GPUs")
-		bootstrapErrors = []error{}
 		needRefresh = false
 		cpuCapability = GetCPUCapability()
 		var memInfo C.mem_info_t
@@ -229,10 +206,7 @@ func GetGPUInfo() GpuInfoList {
 			slog.Warn("error looking up system memory", "error", err)
 		}
 		depPath := LibraryDir()
-		details, err := GetCPUDetails()
-		if err != nil {
-			slog.Warn("failed to lookup CPU details", "error", err)
-		}
+
 		cpus = []CPUInfo{
 			{
 				GpuInfo: GpuInfo{
@@ -242,15 +216,12 @@ func GetGPUInfo() GpuInfoList {
 					ID:             "0",
 					DependencyPath: depPath,
 				},
-				CPUs: details,
 			},
 		}
 
 		// Fallback to CPU mode if we're lacking required vector extensions on x86
 		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
-			err := fmt.Errorf("CPU does not have minimum vector extensions, GPU inference disabled. Required:%s Detected:%s", GPURunnerCPUCapability, cpuCapability)
-			slog.Warn(err.Error())
-			bootstrapErrors = append(bootstrapErrors, err)
+			slog.Warn("CPU does not have minimum vector extensions, GPU inference disabled", "required", GPURunnerCPUCapability, "detected", cpuCapability)
 			bootstrapped = true
 			// No need to do any GPU discovery, since we can't run on them
 			return GpuInfoList{cpus[0].GpuInfo}
@@ -282,6 +253,10 @@ func GetGPUInfo() GpuInfoList {
 				C.free(unsafe.Pointer(memInfo.err))
 				continue
 			}
+			if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
+				slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
+				continue
+			}
 			gpuInfo.TotalMemory = uint64(memInfo.total)
 			gpuInfo.FreeMemory = uint64(memInfo.free)
 			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
@@ -304,15 +279,6 @@ func GetGPUInfo() GpuInfoList {
 			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 			gpuInfo.Variant = variant
 
-			if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
-				unsupportedGPUs = append(unsupportedGPUs,
-					UnsupportedGPUInfo{
-						GpuInfo: gpuInfo.GpuInfo,
-					})
-				slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
-				continue
-			}
-
 			// query the management library as well so we can record any skew between the two
 			// which represents overhead on the GPU we must set aside on subsequent updates
 			if cHandles.nvml != nil {
@@ -375,10 +341,7 @@ func GetGPUInfo() GpuInfoList {
 			}
 		}
 
-		rocmGPUs, err = AMDGetGPUInfo()
-		if err != nil {
-			bootstrapErrors = append(bootstrapErrors, err)
-		}
+		rocmGPUs = AMDGetGPUInfo()
 		bootstrapped = true
 		if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
 			slog.Info("no compatible GPUs were discovered")
@@ -563,114 +526,92 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	return gpuLibPaths
 }
 
-// Bootstrap the runtime library
-// Returns: num devices, handle, libPath, error
-func loadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string, error) {
+func LoadCUDARTMgmt(cudartLibPaths []string) (int, *C.cudart_handle_t, string) {
 	var resp C.cudart_init_resp_t
 	resp.ch.verbose = getVerboseState()
-	var err error
 	for _, libPath := range cudartLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.cudart_init(lib, &resp)
 		if resp.err != nil {
-			err = fmt.Errorf("Unable to load cudart library %s: %s", libPath, C.GoString(resp.err))
-			slog.Debug(err.Error())
+			slog.Debug("Unable to load cudart", "library", libPath, "error", C.GoString(resp.err))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			err = nil
-			return int(resp.num_devices), &resp.ch, libPath, err
+			return int(resp.num_devices), &resp.ch, libPath
 		}
 	}
-	return 0, nil, "", err
+	return 0, nil, ""
 }
 
-// Bootstrap the driver library
-// Returns: num devices, handle, libPath, error
-func loadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string, error) {
+func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
 	var resp C.nvcuda_init_resp_t
 	resp.ch.verbose = getVerboseState()
-	var err error
 	for _, libPath := range nvcudaLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.nvcuda_init(lib, &resp)
 		if resp.err != nil {
 			// Decide what log level based on the type of error message to help users understand why
+			msg := C.GoString(resp.err)
 			switch resp.cudaErr {
 			case C.CUDA_ERROR_INSUFFICIENT_DRIVER, C.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH:
-				err = fmt.Errorf("version mismatch between driver and cuda driver library - reboot or upgrade may be required: library %s", libPath)
-				slog.Warn(err.Error())
+				slog.Warn("version mismatch between driver and cuda driver library - reboot or upgrade may be required", "library", libPath, "error", msg)
 			case C.CUDA_ERROR_NO_DEVICE:
-				err = fmt.Errorf("no nvidia devices detected by library %s", libPath)
-				slog.Info(err.Error())
+				slog.Info("no nvidia devices detected", "library", libPath)
 			case C.CUDA_ERROR_UNKNOWN:
-				err = fmt.Errorf("unknown error initializing cuda driver library %s: %s. see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information", libPath, C.GoString(resp.err))
-				slog.Warn(err.Error())
+				slog.Warn("unknown error initializing cuda driver library", "library", libPath, "error", msg)
+				slog.Warn("see https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for more information")
 			default:
-				msg := C.GoString(resp.err)
 				if strings.Contains(msg, "wrong ELF class") {
 					slog.Debug("skipping 32bit library", "library", libPath)
 				} else {
-					err = fmt.Errorf("Unable to load cudart library %s: %s", libPath, C.GoString(resp.err))
-					slog.Info(err.Error())
+					slog.Info("unable to load cuda driver library", "library", libPath, "error", msg)
 				}
 			}
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			err = nil
-			return int(resp.num_devices), &resp.ch, libPath, err
+			return int(resp.num_devices), &resp.ch, libPath
 		}
 	}
-	return 0, nil, "", err
+	return 0, nil, ""
 }
 
-// Bootstrap the management library
-// Returns: handle, libPath, error
-func loadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string, error) {
+func LoadNVMLMgmt(nvmlLibPaths []string) (*C.nvml_handle_t, string) {
 	var resp C.nvml_init_resp_t
 	resp.ch.verbose = getVerboseState()
-	var err error
 	for _, libPath := range nvmlLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.nvml_init(lib, &resp)
 		if resp.err != nil {
-			err = fmt.Errorf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err))
-			slog.Info(err.Error())
+			slog.Info(fmt.Sprintf("Unable to load NVML management library %s: %s", libPath, C.GoString(resp.err)))
			C.free(unsafe.Pointer(resp.err))
 		} else {
-			err = nil
-			return &resp.ch, libPath, err
+			return &resp.ch, libPath
 		}
 	}
-	return nil, "", err
+	return nil, ""
 }
 
-// bootstrap the Intel GPU library
-// Returns: num devices, handle, libPath, error
-func loadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string, error) {
+func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
 	var resp C.oneapi_init_resp_t
 	num_devices := 0
 	resp.oh.verbose = getVerboseState()
-	var err error
 	for _, libPath := range oneapiLibPaths {
 		lib := C.CString(libPath)
 		defer C.free(unsafe.Pointer(lib))
 		C.oneapi_init(lib, &resp)
 		if resp.err != nil {
-			err = fmt.Errorf("Unable to load oneAPI management library %s: %s", libPath, C.GoString(resp.err))
-			slog.Debug(err.Error())
+			slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
 			C.free(unsafe.Pointer(resp.err))
 		} else {
-			err = nil
 			for i := range resp.oh.num_drivers {
 				num_devices += int(C.oneapi_get_device_count(resp.oh, C.int(i)))
 			}
-			return num_devices, &resp.oh, libPath, err
+			return num_devices, &resp.oh, libPath
 		}
 	}
-	return 0, nil, "", err
+	return 0, nil, ""
 }
 
 func getVerboseState() C.uint16_t {
@@ -728,23 +669,3 @@ func LibraryDir() string {
 	slog.Warn("unable to locate gpu dependency libraries")
 	return ""
 }
-
-func GetSystemInfo() SystemInfo {
-	gpus := GetGPUInfo()
-	gpuMutex.Lock()
-	defer gpuMutex.Unlock()
-	discoveryErrors := []string{}
-	for _, err := range bootstrapErrors {
-		discoveryErrors = append(discoveryErrors, err.Error())
-	}
-	if len(gpus) == 1 && gpus[0].Library == "cpu" {
-		gpus = []GpuInfo{}
-	}
-
-	return SystemInfo{
-		System:          cpus[0],
-		GPUs:            gpus,
-		UnsupportedGPUs: unsupportedGPUs,
-		DiscoveryErrors: discoveryErrors,
-	}
-}
@@ -10,9 +10,7 @@ package gpu
 import "C"
 
 import (
-	"log/slog"
 	"runtime"
-	"syscall"
 
 	"github.com/ollama/ollama/format"
 )
@@ -68,34 +66,3 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 	// No-op on darwin
 	return "", ""
 }
-
-func GetSystemInfo() SystemInfo {
-	mem, _ := GetCPUMem()
-	query := "hw.perflevel0.physicalcpu"
-	perfCores, err := syscall.SysctlUint32(query)
-	if err != nil {
-		slog.Warn("failed to discover physical CPU details", "query", query, "error", err)
-	}
-	query = "hw.perflevel1.physicalcpu"
-	efficiencyCores, _ := syscall.SysctlUint32(query) // On x86 xeon this wont return data
-
-	// Determine thread count
-	query = "hw.logicalcpu"
-	logicalCores, _ := syscall.SysctlUint32(query)
-
-	return SystemInfo{
-		System: CPUInfo{
-			GpuInfo: GpuInfo{
-				memInfo: mem,
-			},
-			CPUs: []CPU{
-				{
-					CoreCount:           int(perfCores + efficiencyCores),
-					EfficiencyCoreCount: int(efficiencyCores),
-					ThreadCount:         int(logicalCores),
-				},
-			},
-		},
-		GPUs: GetGPUInfo(),
-	}
-}
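The darwin GetSystemInfo removed on the "+" side derives core and thread counts from sysctl keys. A self-contained sketch of just those lookups (darwin-only, values vary by machine; not the real ollama function):

```go
//go:build darwin

package main

import (
	"fmt"
	"syscall"
)

// Illustrative only: performance cores, efficiency cores and logical CPUs come
// from hw.perflevel0.physicalcpu, hw.perflevel1.physicalcpu and hw.logicalcpu,
// as in the deleted GetSystemInfo above.
func main() {
	perf, _ := syscall.SysctlUint32("hw.perflevel0.physicalcpu")
	eff, _ := syscall.SysctlUint32("hw.perflevel1.physicalcpu") // empty on x86
	logical, _ := syscall.SysctlUint32("hw.logicalcpu")
	fmt.Printf("cores=%d efficiency=%d threads=%d\n", perf+eff, eff, logical)
}
```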
@@ -4,8 +4,6 @@ import (
 	"bufio"
 	"fmt"
 	"os"
-	"reflect"
-	"regexp"
 	"strings"
 
 	"github.com/ollama/ollama/format"
@@ -92,95 +90,3 @@ func GetCPUMem() (memInfo, error) {
 		}
 	}
 	return mem, nil
 }
-
-const CpuInfoFilename = "/proc/cpuinfo"
-
-type linuxCpuInfo struct {
-	ID         string `cpuinfo:"processor"`
-	VendorID   string `cpuinfo:"vendor_id"`
-	ModelName  string `cpuinfo:"model name"`
-	PhysicalID string `cpuinfo:"physical id"`
-	Siblings   string `cpuinfo:"siblings"`
-	CoreID     string `cpuinfo:"core id"`
-}
-
-func GetCPUDetails() ([]CPU, error) {
-	file, err := os.Open(CpuInfoFilename)
-	if err != nil {
-		return nil, err
-	}
-	reColumns := regexp.MustCompile("\t+: ")
-	scanner := bufio.NewScanner(file)
-	cpuInfos := []linuxCpuInfo{}
-	cpu := &linuxCpuInfo{}
-	for scanner.Scan() {
-		line := scanner.Text()
-		if sl := reColumns.Split(line, 2); len(sl) > 1 {
-			t := reflect.TypeOf(cpu).Elem()
-			s := reflect.ValueOf(cpu).Elem()
-			for i := range t.NumField() {
-				field := t.Field(i)
-				tag := field.Tag.Get("cpuinfo")
-				if tag == sl[0] {
-					s.FieldByName(field.Name).SetString(sl[1])
-					break
-				}
-			}
-		} else if strings.TrimSpace(line) == "" && cpu.ID != "" {
-			cpuInfos = append(cpuInfos, *cpu)
-			cpu = &linuxCpuInfo{}
-		}
-	}
-
-	// Process the sockets/cores/threads
-	socketByID := map[string]*CPU{}
-	coreBySocket := map[string]map[string]struct{}{}
-	threadsByCoreBySocket := map[string]map[string]int{}
-	for _, c := range cpuInfos {
-		if _, found := socketByID[c.PhysicalID]; !found {
-			socketByID[c.PhysicalID] = &CPU{
-				ID:        c.PhysicalID,
-				VendorID:  c.VendorID,
-				ModelName: c.ModelName,
-			}
-			coreBySocket[c.PhysicalID] = map[string]struct{}{}
-			threadsByCoreBySocket[c.PhysicalID] = map[string]int{}
-		}
-		if c.CoreID != "" {
-			coreBySocket[c.PhysicalID][c.PhysicalID+":"+c.CoreID] = struct{}{}
-			threadsByCoreBySocket[c.PhysicalID][c.PhysicalID+":"+c.CoreID]++
-		} else {
-			coreBySocket[c.PhysicalID][c.PhysicalID+":"+c.ID] = struct{}{}
-			threadsByCoreBySocket[c.PhysicalID][c.PhysicalID+":"+c.ID]++
-		}
-	}
-
-	// Tally up the values from the tracking maps
-	for id, s := range socketByID {
-		s.CoreCount = len(coreBySocket[id])
-		s.ThreadCount = 0
-		for _, tc := range threadsByCoreBySocket[id] {
-			s.ThreadCount += tc
-		}
-
-		// This only works if HT is enabled, consider a more reliable model, maybe cache size comparisons?
-		efficiencyCoreCount := 0
-		for _, threads := range threadsByCoreBySocket[id] {
-			if threads == 1 {
-				efficiencyCoreCount++
-			}
-		}
-		if efficiencyCoreCount == s.CoreCount {
-			// 1:1 mapping means they're not actually efficiency cores, but regular cores
-			s.EfficiencyCoreCount = 0
-		} else {
-			s.EfficiencyCoreCount = efficiencyCoreCount
-		}
-	}
-
-	result := []CPU{}
-	for _, c := range socketByID {
-		result = append(result, *c)
-	}
-	return result, nil
-}
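The Linux GetCPUDetails removed above groups /proc/cpuinfo records by physical id (socket), counts distinct core ids per socket, and counts threads per core. A condensed, self-contained sketch of that tally with illustrative sample data (not the real parser):

```go
package main

import "fmt"

// rec is a stand-in for one /proc/cpuinfo processor record.
type rec struct{ physicalID, coreID string }

// tally counts distinct cores and total threads for one socket, mirroring the
// coreBySocket / threadsByCoreBySocket maps in the deleted code.
func tally(recs []rec) (cores, threads int) {
	coreSet := map[string]struct{}{}
	for _, r := range recs {
		coreSet[r.physicalID+":"+r.coreID] = struct{}{}
		threads++
	}
	return len(coreSet), threads
}

func main() {
	// Two hyper-threads on each of two cores of socket "0".
	recs := []rec{{"0", "0"}, {"0", "0"}, {"0", "1"}, {"0", "1"}}
	c, t := tally(recs)
	fmt.Printf("cores=%d threads=%d\n", c, t) // cores=2 threads=4
}
```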
@@ -2,7 +2,6 @@ package gpu
 
 import (
 	"fmt"
-	"log/slog"
 	"syscall"
 	"unsafe"
 )
@@ -23,7 +22,6 @@ var (
 	k32                              = syscall.NewLazyDLL("kernel32.dll")
 	globalMemoryStatusExProc         = k32.NewProc("GlobalMemoryStatusEx")
 	sizeofMemoryStatusEx             = uint32(unsafe.Sizeof(MEMORYSTATUSEX{}))
-	GetLogicalProcessorInformationEx = k32.NewProc("GetLogicalProcessorInformationEx")
 )
 
 var CudartGlobs = []string{
@@ -57,178 +55,3 @@ func GetCPUMem() (memInfo, error) {
 	}
 	return memInfo{TotalMemory: memStatus.TotalPhys, FreeMemory: memStatus.AvailPhys, FreeSwap: memStatus.AvailPageFile}, nil
 }
-
-type LOGICAL_PROCESSOR_RELATIONSHIP uint32
-
-const (
-	RelationProcessorCore LOGICAL_PROCESSOR_RELATIONSHIP = iota
-	RelationNumaNode
-	RelationCache
-	RelationProcessorPackage
-	RelationGroup
-	RelationProcessorDie
-	RelationNumaNodeEx
-	RelationProcessorModule
-)
-const RelationAll LOGICAL_PROCESSOR_RELATIONSHIP = 0xffff
-
-type GROUP_AFFINITY struct {
-	Mask     uintptr // KAFFINITY
-	Group    uint16
-	Reserved [3]uint16
-}
-
-type PROCESSOR_RELATIONSHIP struct {
-	Flags           byte
-	EfficiencyClass byte
-	Reserved        [20]byte
-	GroupCount      uint16
-	GroupMask       [1]GROUP_AFFINITY // len GroupCount
-}
-
-// Omitted unused structs: NUMA_NODE_RELATIONSHIP CACHE_RELATIONSHIP GROUP_RELATIONSHIP
-
-type SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX struct {
-	Relationship LOGICAL_PROCESSOR_RELATIONSHIP
-	Size         uint32
-	U            [1]byte // Union len Size
-	// PROCESSOR_RELATIONSHIP
-	// NUMA_NODE_RELATIONSHIP
-	// CACHE_RELATIONSHIP
-	// GROUP_RELATIONSHIP
-}
-
-func (group *GROUP_AFFINITY) IsMember(target *GROUP_AFFINITY) bool {
-	if group == nil || target == nil {
-		return false
-	}
-	return group.Mask&target.Mask != 0
-}
-
-type winPackage struct {
-	groups              []*GROUP_AFFINITY
-	coreCount           int // performance cores = coreCount - efficiencyCoreCount
-	efficiencyCoreCount int
-	threadCount         int
-}
-
-func (pkg *winPackage) IsMember(target *GROUP_AFFINITY) bool {
-	for _, group := range pkg.groups {
-		if group.IsMember(target) {
-			return true
-		}
-	}
-	return false
-}
-
-func getLogicalProcessorInformationEx() ([]byte, error) {
-	buf := make([]byte, 1)
-	bufSize := len(buf)
-	ret, _, err := GetLogicalProcessorInformationEx.Call(
-		uintptr(RelationAll),
-		uintptr(unsafe.Pointer(&buf[0])),
-		uintptr(unsafe.Pointer(&bufSize)),
-	)
-	if ret != 0 {
-		return nil, fmt.Errorf("failed to determine size info ret:%d %w", ret, err)
-	}
-
-	buf = make([]byte, bufSize)
-	ret, _, err = GetLogicalProcessorInformationEx.Call(
-		uintptr(RelationAll),
-		uintptr(unsafe.Pointer(&buf[0])),
-		uintptr(unsafe.Pointer(&bufSize)),
-	)
-	if ret == 0 {
-		return nil, fmt.Errorf("failed to gather processor information ret:%d buflen:%d %w", ret, bufSize, err)
-	}
-	return buf, nil
-}
-
-func processSystemLogicalProcessorInforationList(buf []byte) []*winPackage {
-	var slpi *SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX
-	// Find all the packages first
-	packages := []*winPackage{}
-	for bufOffset := 0; bufOffset < len(buf); bufOffset += int(slpi.Size) {
-		slpi = (*SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)(unsafe.Pointer(&buf[bufOffset]))
-		if slpi.Relationship != RelationProcessorPackage {
-			continue
-		}
-		pr := (*PROCESSOR_RELATIONSHIP)(unsafe.Pointer(&slpi.U[0]))
-		pkg := &winPackage{}
-		ga0 := unsafe.Pointer(&pr.GroupMask[0])
-		for j := range pr.GroupCount {
-			gm := (*GROUP_AFFINITY)(unsafe.Pointer(uintptr(ga0) + uintptr(j)*unsafe.Sizeof(GROUP_AFFINITY{})))
-			pkg.groups = append(pkg.groups, gm)
-		}
-		packages = append(packages, pkg)
-	}
-
-	slog.Info("packages", "count", len(packages))
-
-	// To identify efficiency cores we have to compare the relative values
-	// Larger values are "less efficient" (aka, more performant)
-	var maxEfficiencyClass byte
-	for bufOffset := 0; bufOffset < len(buf); bufOffset += int(slpi.Size) {
-		slpi = (*SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)(unsafe.Pointer(&buf[bufOffset]))
-		if slpi.Relationship != RelationProcessorCore {
-			continue
-		}
-		pr := (*PROCESSOR_RELATIONSHIP)(unsafe.Pointer(&slpi.U[0]))
-		if pr.EfficiencyClass > maxEfficiencyClass {
-			maxEfficiencyClass = pr.EfficiencyClass
-		}
-	}
-	if maxEfficiencyClass > 0 {
-		slog.Info("efficiency cores detected", "maxEfficiencyClass", maxEfficiencyClass)
-	}
-
-	// then match up the Cores to the Packages, count up cores, threads and efficiency cores
-	for bufOffset := 0; bufOffset < len(buf); bufOffset += int(slpi.Size) {
-		slpi = (*SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX)(unsafe.Pointer(&buf[bufOffset]))
-		if slpi.Relationship != RelationProcessorCore {
-			continue
-		}
-		pr := (*PROCESSOR_RELATIONSHIP)(unsafe.Pointer(&slpi.U[0]))
-		ga0 := unsafe.Pointer(&pr.GroupMask[0])
-		for j := range pr.GroupCount {
-			gm := (*GROUP_AFFINITY)(unsafe.Pointer(uintptr(ga0) + uintptr(j)*unsafe.Sizeof(GROUP_AFFINITY{})))
-			for _, pkg := range packages {
-				if pkg.IsMember(gm) {
-					pkg.coreCount++
-					if pr.Flags == 0 {
-						pkg.threadCount++
-					} else {
-						pkg.threadCount += 2
-					}
-					if pr.EfficiencyClass < maxEfficiencyClass {
-						pkg.efficiencyCoreCount++
-					}
-				}
-			}
-		}
-	}
-
-	// Sumarize the results
-	for i, pkg := range packages {
-		slog.Info("", "package", i, "cores", pkg.coreCount, "efficiency", pkg.efficiencyCoreCount, "threads", pkg.threadCount)
-	}
-
-	return packages
-}
-
-func GetCPUDetails() ([]CPU, error) {
-	buf, err := getLogicalProcessorInformationEx()
-	if err != nil {
-		return nil, err
-	}
-	packages := processSystemLogicalProcessorInforationList(buf)
-	cpus := make([]CPU, len(packages))
-
-	for i, pkg := range packages {
-		cpus[i].CoreCount = pkg.coreCount
-		cpus[i].EfficiencyCoreCount = pkg.efficiencyCoreCount
-		cpus[i].ThreadCount = pkg.threadCount
-	}
-	return cpus, nil
-}
File diff suppressed because one or more lines are too long
gpu/types.go (36 changed lines)
@@ -10,11 +10,11 @@ import (
 type memInfo struct {
 	TotalMemory uint64 `json:"total_memory,omitempty"`
 	FreeMemory  uint64 `json:"free_memory,omitempty"`
-	FreeSwap    uint64 `json:"free_swap,omitempty"` // TODO split this out for system only
+	FreeSwap    uint64 `json:"free_swap,omitempty"`
 }
 
 // Beginning of an `ollama info` command
-type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
+type GpuInfo struct {
 	memInfo
 	Library string `json:"library,omitempty"`
 
@@ -49,17 +49,6 @@ type GpuInfo struct {
 
 type CPUInfo struct {
 	GpuInfo
-	CPUs []CPU
-}
-
-// CPU type represents a CPU Package occupying a socket
-type CPU struct {
-	ID                  string `cpuinfo:"processor"`
-	VendorID            string `cpuinfo:"vendor_id"`
-	ModelName           string `cpuinfo:"model name"`
-	CoreCount           int
-	EfficiencyCoreCount int // Performance = CoreCount - Efficiency
-	ThreadCount         int
 }
 
 type CudaGPUInfo struct {
@@ -87,11 +76,6 @@ type OneapiGPUInfoList []OneapiGPUInfo
 
 type GpuInfoList []GpuInfo
 
-type UnsupportedGPUInfo struct {
-	GpuInfo
-	Reason string `json:"reason"`
-}
-
 // Split up the set of gpu info's by Library and variant
 func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	resp := []GpuInfoList{}
@@ -162,19 +146,3 @@ func (c CPUCapability) String() string {
 		return "no vector extensions"
 	}
 }
-
-type SystemInfo struct {
-	System          CPUInfo              `json:"system"`
-	GPUs            []GpuInfo            `json:"gpus"`
-	UnsupportedGPUs []UnsupportedGPUInfo `json:"unsupported_gpus"`
-	DiscoveryErrors []string             `json:"discovery_errors"`
-}
-
-// Return the optimal number of threads to use for inference
-func (si SystemInfo) GetOptimalThreadCount() int {
-	if len(si.System.CPUs) == 0 {
-		return 0
-	}
-	// Allocate thread count matching the performance cores on a single socket
-	return si.System.CPUs[0].CoreCount - si.System.CPUs[0].EfficiencyCoreCount
-}
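The GetOptimalThreadCount removed above budgets one inference thread per performance core on the first socket, where performance cores are CoreCount minus EfficiencyCoreCount. A tiny illustration of that arithmetic, with a stand-in CPU struct rather than the real one from types.go:

```go
package main

import "fmt"

// CPU is a stand-in with just the two fields the heuristic needs.
type CPU struct {
	CoreCount           int
	EfficiencyCoreCount int
}

// optimalThreads mirrors the removed method: performance cores on socket 0.
func optimalThreads(cpus []CPU) int {
	if len(cpus) == 0 {
		return 0
	}
	return cpus[0].CoreCount - cpus[0].EfficiencyCoreCount
}

func main() {
	fmt.Println(optimalThreads([]CPU{{CoreCount: 14, EfficiencyCoreCount: 8}})) // 6
}
```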
@@ -3,12 +3,12 @@ package llama
 /*
 #cgo CFLAGS: -O2 -std=c11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
 #cgo CXXFLAGS: -O2 -std=c++11 -DGGML_BUILD=1 -DNDEBUG -DLOG_DISABLE_LOGS -DGGML_USE_LLAMAFILE
-#cgo darwin,arm64 CFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DGGML_METAL_EMBED_LIBRARY -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_BLAS -mmacosx-version-min=11.3
-#cgo darwin,arm64 CXXFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DGGML_METAL_EMBED_LIBRARY -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_BLAS -mmacosx-version-min=11.3
-#cgo darwin,arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework Accelerate -mmacosx-version-min=11.3
-#cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers -mmacosx-version-min=11.3
-#cgo darwin,amd64 CXXFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers -mmacosx-version-min=11.3
-#cgo darwin,amd64 LDFLAGS: -framework Foundation -mmacosx-version-min=11.3
+#cgo darwin,arm64 CFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DGGML_METAL_EMBED_LIBRARY -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_BLAS
+#cgo darwin,arm64 CXXFLAGS: -DGGML_USE_METAL -DGGML_USE_ACCELERATE -DGGML_METAL_EMBED_LIBRARY -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64 -DGGML_USE_BLAS
+#cgo darwin,arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
+#cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
+#cgo darwin,amd64 CXXFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
+#cgo darwin,amd64 LDFLAGS: -framework Foundation
 #cgo darwin,amd64,avx2 CFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 #cgo darwin,amd64,avx2 CXXFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 #cgo darwin,amd64,avx2 LDFLAGS: -framework Accelerate
@@ -251,7 +251,7 @@ if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" -a -d "${ROCM_PATH}" ]; then
     ROCM_VARIANT=_v$(ls ${ROCM_PATH}/lib/librocblas.so.*.*.????? | cut -f5 -d. || true)
     fi
     init_vars
-    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DGGML_HIPBLAS=on -DGGML_CUDA_NO_PEER_COPY=on -DCMAKE_C_COMPILER=$ROCM_PATH/llvm/bin/clang -DCMAKE_CXX_COMPILER=$ROCM_PATH/llvm/bin/clang++ -DAMDGPU_TARGETS=$(amdGPUs) -DGPU_TARGETS=$(amdGPUs)"
     # Users building from source can tune the exact flags we pass to cmake for configuring llama.cpp
     if [ -n "${OLLAMA_CUSTOM_ROCM_DEFS}" ]; then
       echo "OLLAMA_CUSTOM_ROCM_DEFS=\"${OLLAMA_CUSTOM_ROCM_DEFS}\""
@ -340,6 +340,7 @@ function build_rocm() {
|
|||||||
"-DCMAKE_C_COMPILER=clang.exe",
|
"-DCMAKE_C_COMPILER=clang.exe",
|
||||||
"-DCMAKE_CXX_COMPILER=clang++.exe",
|
"-DCMAKE_CXX_COMPILER=clang++.exe",
|
||||||
"-DGGML_HIPBLAS=on",
|
"-DGGML_HIPBLAS=on",
|
||||||
|
"-DGGML_CUDA_NO_PEER_COPY=on",
|
||||||
"-DHIP_PLATFORM=amd",
|
"-DHIP_PLATFORM=amd",
|
||||||
"-DGGML_AVX=on",
|
"-DGGML_AVX=on",
|
||||||
"-DGGML_AVX2=off",
|
"-DGGML_AVX2=off",
|
||||||
|
@@ -244,8 +244,6 @@ func (t Tensor) typeSize() uint64 {
 		return 8
 	case 29: // IQ1_M
 		return blockSize/8 + blockSize/16 + blockSize/32
-	case 30: // BF16
-		return 2
 	default:
 		return 0
 	}
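The case removed above sizes BF16 tensors at 2 bytes per element. A tiny illustration of what that constant means for a tensor's byte footprint (values illustrative, not an ollama API):

```go
package main

import "fmt"

// bf16Bytes computes the byte size of a BF16 tensor: element count times the
// 2-byte element size that the removed "case 30" returned.
func bf16Bytes(elements uint64) uint64 {
	const bf16TypeSize = 2 // bytes per element
	return elements * bf16TypeSize
}

func main() {
	fmt.Println(bf16Bytes(4096 * 4096)) // 33554432 bytes for a 4096x4096 BF16 tensor
}
```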
@@ -98,11 +98,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 	var systemFreeMemory uint64
 	var systemSwapFreeMemory uint64
 
-	systemInfo := gpu.GetSystemInfo()
-	systemTotalMemory = systemInfo.System.TotalMemory
-	systemFreeMemory = systemInfo.System.FreeMemory
-	systemSwapFreeMemory = systemInfo.System.FreeSwap
-	slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
+	systemMemInfo, err := gpu.GetCPUMem()
+	if err != nil {
+		slog.Error("failed to lookup system memory", "error", err)
+	} else {
+		systemTotalMemory = systemMemInfo.TotalMemory
+		systemFreeMemory = systemMemInfo.FreeMemory
+		systemSwapFreeMemory = systemMemInfo.FreeSwap
+		slog.Info("system memory", "total", format.HumanBytes2(systemTotalMemory), "free", format.HumanBytes2(systemFreeMemory), "free_swap", format.HumanBytes2(systemSwapFreeMemory))
+	}
 
 	// If the user wants zero GPU layers, reset the gpu list to be CPU/system ram info
 	if opts.NumGPU == 0 {
|
|||||||
params = append(params, "--mmproj", projectors[0])
|
params = append(params, "--mmproj", projectors[0])
|
||||||
}
|
}
|
||||||
|
|
||||||
defaultThreads := systemInfo.GetOptimalThreadCount()
|
|
||||||
if opts.NumThread > 0 {
|
if opts.NumThread > 0 {
|
||||||
params = append(params, "--threads", strconv.Itoa(opts.NumThread))
|
params = append(params, "--threads", strconv.Itoa(opts.NumThread))
|
||||||
} else if defaultThreads > 0 {
|
|
||||||
params = append(params, "--threads", strconv.Itoa(defaultThreads))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if !opts.F16KV {
|
if !opts.F16KV {
|
||||||
@ -259,7 +260,15 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
|
|||||||
params = append(params, "--mlock")
|
params = append(params, "--mlock")
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO - NUMA support currently doesn't work properly
|
if gpu.IsNUMA() && gpus[0].Library == "cpu" {
|
||||||
|
numaMode := "distribute"
|
||||||
|
if runtime.GOOS == "linux" {
|
||||||
|
if _, err := exec.LookPath("numactl"); err == nil {
|
||||||
|
numaMode = "numactl"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
params = append(params, "--numa", numaMode)
|
||||||
|
}
|
||||||
|
|
||||||
params = append(params, "--parallel", strconv.Itoa(numParallel))
|
params = append(params, "--parallel", strconv.Itoa(numParallel))
|
||||||
|
|
||||||
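The block added on the "+" side above picks a NUMA policy for the llama.cpp runner: prefer numactl on Linux when the binary is on PATH, otherwise fall back to the "distribute" policy. A standalone sketch of just that selection, using only standard-library calls (the surrounding parameter plumbing is omitted):

```go
package main

import (
	"fmt"
	"os/exec"
	"runtime"
)

// numaMode mirrors the selection logic in the hunk: "numactl" when available
// on Linux, otherwise "distribute".
func numaMode() string {
	mode := "distribute"
	if runtime.GOOS == "linux" {
		if _, err := exec.LookPath("numactl"); err == nil {
			mode = "numactl"
		}
	}
	return mode
}

func main() {
	fmt.Println("--numa", numaMode())
}
```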