diff --git a/gpu/amd_linux.go b/gpu/amd_linux.go
index 15b6fc61..6493af9e 100644
--- a/gpu/amd_linux.go
+++ b/gpu/amd_linux.go
@@ -10,6 +10,7 @@ import (
 	"path/filepath"
 	"regexp"
 	"slices"
+	"sort"
 	"strconv"
 	"strings"
 
@@ -82,6 +83,21 @@ func AMDGetGPUInfo() []RocmGPUInfo {
 	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
 	// from the other IDs to get alignment with the HIP libraries expectations (zero is the first GPU, not the CPU)
 	matches, _ := filepath.Glob(GPUPropertiesFileGlob)
+	// Order the nodes numerically by ID rather than relying on the order Glob returns them in.
+	sort.Slice(matches, func(i, j int) bool {
+		// /sys/class/kfd/kfd/topology/nodes/<number>/properties
+		a, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[i])), 10, 64)
+		if err != nil {
+			slog.Debug("parse err", "error", err, "match", matches[i])
+			return false
+		}
+		b, err := strconv.ParseInt(filepath.Base(filepath.Dir(matches[j])), 10, 64)
+		if err != nil {
+			slog.Debug("parse err", "error", err, "match", matches[j])
+			return false
+		}
+		return a < b
+	})
 	cpuCount := 0
 	for _, match := range matches {
 		slog.Debug("evaluating amdgpu node " + match)