diff --git a/gpu/amd.go b/gpu/amd.go
new file mode 100644
index 00000000..84af6b37
--- /dev/null
+++ b/gpu/amd.go
@@ -0,0 +1,104 @@
+package gpu
+
+import (
+	"bufio"
+	"fmt"
+	"log/slog"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+)
+
+// TODO - windows vs. non-windows vs darwin
+
+// Discovery logic for AMD/ROCm GPUs
+
+const (
+	DriverVersionFile     = "/sys/module/amdgpu/version"
+	GPUPropertiesFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/properties"
+	// TODO probably break these down per GPU to make the logic simpler
+	GPUTotalMemoryFileGlob = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/properties" // size_in_bytes line
+	GPUUsedMemoryFileGlob  = "/sys/class/kfd/kfd/topology/nodes/*/mem_banks/*/used_memory"
+)
+
+// AMDDetected reports whether AMDDriverVersion can read the driver version file.
+func AMDDetected() bool {
+	_, err := AMDDriverVersion()
+	return err == nil
+}
+
+// AMDDriverVersion returns the contents of DriverVersionFile with surrounding
+// whitespace trimmed, or the error from reading the file.
+func AMDDriverVersion() (string, error) {
+	verString, err := os.ReadFile(DriverVersionFile)
+	if err != nil {
+		return "", err
+	}
+	return strings.TrimSpace(string(verString)), nil
+}
+
+// AMDGFXVersions returns one Version per gfx_target_version line found in the
+// files matching GPUPropertiesFileGlob. Files that cannot be read and lines
+// that cannot be parsed are logged at debug level and skipped.
+func AMDGFXVersions() []Version {
+	res := []Version{}
+	// Glob's only error is ErrBadPattern, and the pattern is a fixed constant.
+	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
+	for _, match := range matches {
+		res = append(res, gfxVersionsFromFile(match)...)
+	}
+	return res
+}
+
+// gfxVersionsFromFile parses the gfx_target_version lines of a single
+// properties file. It is a separate function so that the deferred Close runs
+// after each file instead of piling up until every node has been visited.
+func gfxVersionsFromFile(path string) []Version {
+	fp, err := os.Open(path)
+	if err != nil {
+		slog.Debug(fmt.Sprintf("failed to open sysfs node file %s: %s", path, err))
+		return nil
+	}
+	defer fp.Close()
+
+	var res []Version
+	scanner := bufio.NewScanner(fp)
+	for scanner.Scan() {
+		line := strings.TrimSpace(scanner.Text())
+		if !strings.HasPrefix(line, "gfx_target_version") {
+			continue
+		}
+		ver := strings.Fields(line)
+		if len(ver) != 2 || len(ver[1]) < 5 {
+			slog.Debug("malformed " + line)
+			continue
+		}
+		// The last two characters are the patch, the two before them the
+		// minor, and whatever precedes those is the major.
+		l := len(ver[1])
+		patch, err1 := strconv.ParseUint(ver[1][l-2:l], 10, 32)
+		minor, err2 := strconv.ParseUint(ver[1][l-4:l-2], 10, 32)
+		major, err3 := strconv.ParseUint(ver[1][:l-4], 10, 32)
+		if err1 != nil || err2 != nil || err3 != nil {
+			slog.Debug("malformed int " + line)
+			continue
+		}
+		res = append(res, Version{
+			Major: uint(major),
+			Minor: uint(minor),
+			Patch: uint(patch),
+		})
+	}
+	if err := scanner.Err(); err != nil {
+		slog.Debug(fmt.Sprintf("failed to read sysfs node file %s: %s", path, err))
+	}
+	return res
+}
+
+// ToGFXString formats the version as a gfx target name, e.g. gfx1030. Minor
+// and patch are written in hexadecimal, so {9, 0, 10} becomes gfx90a rather
+// than the ambiguous gfx9010.
+func (v Version) ToGFXString() string {
+	return fmt.Sprintf("gfx%d%x%x", v.Major, v.Minor, v.Patch)
+}
diff --git a/gpu/gpu.go b/gpu/gpu.go
index 6e67e653..3831ee4a 100644
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -149,43 +149,63 @@ func GetGPUInfo() GpuInfo {
 				slog.Info(fmt.Sprintf("CUDA GPU is too old. Falling back to CPU mode. Compute Capability detected: %d.%d", cc.major, cc.minor))
 			}
 		}
-	} else if gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
-		C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
-		if memInfo.err != nil {
-			slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
-			C.free(unsafe.Pointer(memInfo.err))
-		} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
-			// Only one GPU detected and it appears to be an integrated GPU - skip it
-			slog.Info("ROCm unsupported integrated GPU detected")
-		} else if memInfo.count > 0 {
-			if memInfo.igpu_index >= 0 {
-				// We have multiple GPUs reported, and one of them is an integrated GPU
-				// so we have to set the env var to bypass it
-				// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
-				val := os.Getenv("ROCR_VISIBLE_DEVICES")
-				if val == "" {
-					devices := []string{}
-					for i := 0; i < int(memInfo.count); i++ {
-						if i == int(memInfo.igpu_index) {
-							continue
+	} else if AMDDetected() && gpuHandles.rocm != nil && (cpuVariant != "" || runtime.GOARCH != "amd64") {
+		ver, err := AMDDriverVersion()
+		if err == nil {
+			slog.Info("AMD Driver: " + ver)
+		}
+		gfx := AMDGFXVersions()
+		tooOld := false
+		for _, v := range gfx {
+			if v.Major < 9 {
+				slog.Info("AMD GPU too old, falling back to CPU " + v.ToGFXString())
+				tooOld = true
+				break
+			}
+
+			// TODO - remap gfx strings for unsupported minor/patch versions to supported for the same major
+			// e.g. gfx1034 works if we map it to gfx1030 at runtime
+
+		}
+		if !tooOld {
+			// TODO - this algo can be shifted over to use sysfs instead of the rocm info library...
+			C.rocm_check_vram(*gpuHandles.rocm, &memInfo)
+			if memInfo.err != nil {
+				slog.Info(fmt.Sprintf("error looking up ROCm GPU memory: %s", C.GoString(memInfo.err)))
+				C.free(unsafe.Pointer(memInfo.err))
+			} else if memInfo.igpu_index >= 0 && memInfo.count == 1 {
+				// Only one GPU detected and it appears to be an integrated GPU - skip it
+				slog.Info("ROCm unsupported integrated GPU detected")
+			} else if memInfo.count > 0 {
+				if memInfo.igpu_index >= 0 {
+					// We have multiple GPUs reported, and one of them is an integrated GPU
+					// so we have to set the env var to bypass it
+					// If the user has specified their own ROCR_VISIBLE_DEVICES, don't clobber it
+					val := os.Getenv("ROCR_VISIBLE_DEVICES")
+					if val == "" {
+						devices := []string{}
+						for i := 0; i < int(memInfo.count); i++ {
+							if i == int(memInfo.igpu_index) {
+								continue
+							}
+							devices = append(devices, strconv.Itoa(i))
 						}
-						devices = append(devices, strconv.Itoa(i))
+						val = strings.Join(devices, ",")
+						os.Setenv("ROCR_VISIBLE_DEVICES", val)
 					}
-					val = strings.Join(devices, ",")
-					os.Setenv("ROCR_VISIBLE_DEVICES", val)
+					slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
 				}
-				slog.Info(fmt.Sprintf("ROCm integrated GPU detected - ROCR_VISIBLE_DEVICES=%s", val))
+				resp.Library = "rocm"
+				var version C.rocm_version_resp_t
+				C.rocm_get_version(*gpuHandles.rocm, &version)
+				verString := C.GoString(version.str)
+				if version.status == 0 {
+					resp.Variant = "v" + verString
+				} else {
+					slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
+				}
+				C.free(unsafe.Pointer(version.str))
 			}
-			resp.Library = "rocm"
-			var version C.rocm_version_resp_t
-			C.rocm_get_version(*gpuHandles.rocm, &version)
-			verString := C.GoString(version.str)
-			if version.status == 0 {
-				resp.Variant = "v" + verString
-			} else {
-				slog.Info(fmt.Sprintf("failed to look up ROCm version: %s", verString))
-			}
-			C.free(unsafe.Pointer(version.str))
 		}
 	}
 	if resp.Library == "" {
diff --git a/gpu/types.go b/gpu/types.go
index 24fa4a24..67727180 100644
--- a/gpu/types.go
+++ b/gpu/types.go
@@ -16,3 +16,10 @@ type GpuInfo struct {
 
 	// TODO add other useful attributes about the card here for discovery information
 }
+
+// Version is a major/minor/patch triple, as parsed from gfx_target_version.
+type Version struct {
+	Major uint
+	Minor uint
+	Patch uint
+}
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 65ca602e..e6a7d077 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -21,7 +21,6 @@ amdGPUs() {
         return
     fi
     GPU_LIST=(
-        "gfx803"
         "gfx900"
         "gfx906:xnack-"
         "gfx908:xnack-"
diff --git a/llm/payload_common.go b/llm/payload_common.go
index d0218979..3958b9f5 100644
--- a/llm/payload_common.go
+++ b/llm/payload_common.go
@@ -90,6 +90,7 @@ func getDynLibs(gpuInfo gpu.GpuInfo) []string {
 	if len(dynLibs) == 0 {
 		dynLibs = []string{availableDynLibs["cpu"]}
 	}
+	slog.Debug(fmt.Sprintf("ordered list of LLM libraries to try %v", dynLibs))
 	return dynLibs
 }
 