Skip to content

Commit

Permalink
Merge pull request #4238 from dhiltgen/gpu_info
Browse files Browse the repository at this point in the history
Record more GPU information
  • Loading branch information
dhiltgen committed May 9, 2024
2 parents d0425f2 + 8727a9c commit dc18eee
Show file tree
Hide file tree
Showing 10 changed files with 150 additions and 96 deletions.
15 changes: 10 additions & 5 deletions gpu/amd_hip_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package gpu
import (
"fmt"
"log/slog"
"strconv"
"syscall"
"unsafe"

Expand Down Expand Up @@ -74,16 +73,22 @@ func (hl *HipLib) Release() {
hl.dll = 0
}

func (hl *HipLib) AMDDriverVersion() (string, error) {
func (hl *HipLib) AMDDriverVersion() (driverMajor, driverMinor int, err error) {
if hl.dll == 0 {
return "", fmt.Errorf("dll has been unloaded")
return 0, 0, fmt.Errorf("dll has been unloaded")
}
var version int
status, _, err := syscall.SyscallN(hl.hipDriverGetVersion, uintptr(unsafe.Pointer(&version)))
if status != hipSuccess {
return "", fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
return 0, 0, fmt.Errorf("failed call to hipDriverGetVersion: %d %s", status, err)
}
return strconv.Itoa(version), nil

slog.Debug("hipDriverGetVersion", "version", version)
// TODO - this isn't actually right, but the docs claim hipDriverGetVersion isn't accurate anyway...
driverMajor = version / 1000
driverMinor = (version - (driverMajor * 1000)) / 10

return driverMajor, driverMinor, nil
}

func (hl *HipLib) HipGetDeviceCount() int {
Expand Down
82 changes: 61 additions & 21 deletions gpu/amd_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"log/slog"
"os"
"path/filepath"
"regexp"
"slices"
"strconv"
"strings"
Expand Down Expand Up @@ -41,10 +42,8 @@ func AMDGetGPUInfo() []GpuInfo {
}

// Opportunistic logging of driver version to aid in troubleshooting
ver, err := AMDDriverVersion()
if err == nil {
slog.Info("AMD Driver: " + ver)
} else {
driverMajor, driverMinor, err := AMDDriverVersion()
if err != nil {
// TODO - if we see users crash and burn with the upstreamed kernel this can be adjusted to hard-fail rocm support and fallback to CPU
slog.Warn("ollama recommends running the https://www.amd.com/en/support/linux-drivers", "error", err)
}
Expand Down Expand Up @@ -91,6 +90,7 @@ func AMDGetGPUInfo() []GpuInfo {
scanner := bufio.NewScanner(fp)
isCPU := false
var major, minor, patch uint64
var vendor, device uint64
for scanner.Scan() {
line := strings.TrimSpace(scanner.Text())
// Note: we could also use "cpu_cores_count X" where X is greater than zero to detect CPUs
Expand Down Expand Up @@ -118,6 +118,26 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Debug("malformed int " + line)
continue
}
} else if strings.HasPrefix(line, "vendor_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Debug("malformed vendor_id", "vendor_id", line)
continue
}
vendor, err = strconv.ParseUint(ver[1], 10, 32)
if err != nil {
slog.Debug("malformed vendor_id" + line)
}
} else if strings.HasPrefix(line, "device_id") {
ver := strings.Fields(line)
if len(ver) != 2 {
slog.Debug("malformed device_id", "device_id", line)
continue
}
device, err = strconv.ParseUint(ver[1], 10, 32)
if err != nil {
slog.Debug("malformed device_id" + line)
}
}

// TODO - any other properties we want to extract and record?
Expand All @@ -140,7 +160,7 @@ func AMDGetGPUInfo() []GpuInfo {
}

if int(major) < RocmComputeMin {
slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%d%x", major, minor, patch), "gpu", gpuID)
slog.Warn(fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch), "gpu", gpuID)
continue
}

Expand Down Expand Up @@ -210,24 +230,29 @@ func AMDGetGPUInfo() []GpuInfo {

// iGPU detection, remove this check once we can support an iGPU variant of the rocm library
if totalMemory < IGPUMemLimit {
slog.Info("amdgpu appears to be an iGPU, skipping", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Info("unsupported Radeon iGPU detected skipping", "id", gpuID, "total", format.HumanBytes2(totalMemory))
continue
}
var name string
// TODO - PCI ID lookup
if vendor > 0 && device > 0 {
name = fmt.Sprintf("%04x:%04x", vendor, device)
}

slog.Info("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Info("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
slog.Debug("amdgpu memory", "gpu", gpuID, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", gpuID, "available", format.HumanBytes2(totalMemory-usedMemory))
gpuInfo := GpuInfo{
Library: "rocm",
memInfo: memInfo{
TotalMemory: totalMemory,
FreeMemory: (totalMemory - usedMemory),
},
ID: fmt.Sprintf("%d", gpuID),
// Name: not exposed in sysfs directly, would require pci device id lookup
Major: int(major),
Minor: int(minor),
Patch: int(patch),
ID: fmt.Sprintf("%d", gpuID),
Name: name,
Compute: fmt.Sprintf("gfx%d%x%x", major, minor, patch),
MinimumMemory: rocmMinimumMemory,
DriverMajor: driverMajor,
DriverMinor: driverMinor,
}

// If the user wants to filter to a subset of devices, filter out if we aren't a match
Expand Down Expand Up @@ -266,7 +291,7 @@ func AMDGetGPUInfo() []GpuInfo {
}
slog.Debug("rocm supported GPUs", "types", supported)
}
gfx := fmt.Sprintf("gfx%d%d%x", gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch)
gfx := gpuInfo.Compute
if !slices.Contains[[]string, string](supported, gfx) {
slog.Warn("amdgpu is not supported", "gpu", gpuInfo.ID, "gpu_type", gfx, "library", libDir, "supported_types", supported)
// TODO - consider discrete markdown just for ROCM troubleshooting?
Expand All @@ -276,7 +301,7 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Info("amdgpu is supported", "gpu", gpuInfo.ID, "gpu_type", gfx)
}
} else {
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
}

// The GPU has passed all the verification steps and is supported
Expand Down Expand Up @@ -322,19 +347,34 @@ func AMDValidateLibDir() (string, error) {
return "", fmt.Errorf("no suitable rocm found, falling back to CPU")
}

func AMDDriverVersion() (string, error) {
_, err := os.Stat(DriverVersionFile)
func AMDDriverVersion() (driverMajor, driverMinor int, err error) {
_, err = os.Stat(DriverVersionFile)
if err != nil {
return "", fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
return 0, 0, fmt.Errorf("amdgpu version file missing: %s %w", DriverVersionFile, err)
}
fp, err := os.Open(DriverVersionFile)
if err != nil {
return "", err
return 0, 0, err
}
defer fp.Close()
verString, err := io.ReadAll(fp)
if err != nil {
return "", err
return 0, 0, err
}

pattern := `\A(\d+)\.(\d+).*`
regex := regexp.MustCompile(pattern)
match := regex.FindStringSubmatch(string(verString))
if len(match) < 2 {
return 0, 0, fmt.Errorf("malformed version string %s", string(verString))
}
driverMajor, err = strconv.Atoi(match[1])
if err != nil {
return 0, 0, err
}
driverMinor, err = strconv.Atoi(match[2])
if err != nil {
return 0, 0, err
}
return strings.TrimSpace(string(verString)), nil
return driverMajor, driverMinor, nil
}
66 changes: 19 additions & 47 deletions gpu/amd_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
"os"
"path/filepath"
"slices"
"strconv"
"strings"

"github.com/ollama/ollama/format"
Expand All @@ -34,13 +33,12 @@ func AMDGetGPUInfo() []GpuInfo {
}
defer hl.Release()

ver, err := hl.AMDDriverVersion()
if err == nil {
slog.Info("AMD Driver: " + ver)
} else {
// For now this is benign, but we may eventually need to fail compatibility checks
slog.Debug("error looking up amd driver version", "error", err)
}
// TODO - this reports incorrect version information, so omitting for now
// driverMajor, driverMinor, err := hl.AMDDriverVersion()
// if err != nil {
// // For now this is benign, but we may eventually need to fail compatibility checks
// slog.Debug("error looking up amd driver version", "error", err)
// }

// Note: the HIP library automatically handles subsetting to any HIP_VISIBLE_DEVICES the user specified
count := hl.HipGetDeviceCount()
Expand All @@ -62,10 +60,10 @@ func AMDGetGPUInfo() []GpuInfo {
return nil
}
} else {
slog.Debug("skipping rocm gfx compatibility check with HSA_OVERRIDE_GFX_VERSION=" + gfxOverride)
slog.Info("skipping rocm gfx compatibility check", "HSA_OVERRIDE_GFX_VERSION", gfxOverride)
}

slog.Info("detected hip devices", "count", count)
slog.Debug("detected hip devices", "count", count)
// TODO how to determine the underlying device ID when visible devices is causing this to subset?
for i := 0; i < count; i++ {
err = hl.HipSetDevice(i)
Expand All @@ -85,18 +83,11 @@ func AMDGetGPUInfo() []GpuInfo {
// Can luid be used on windows for setting visible devices (and is it actually set?)
n = bytes.IndexByte(props.GcnArchName[:], 0)
gfx := string(props.GcnArchName[:n])
slog.Info("hip device", "id", i, "name", name, "gfx", gfx)
var major, minor, patch string
switch len(gfx) {
case 6:
major, minor, patch = gfx[3:4], gfx[4:5], gfx[5:]
case 7:
major, minor, patch = gfx[3:5], gfx[5:6], gfx[6:]
}
slog.Debug("hip device", "id", i, "name", name, "gfx", gfx)
//slog.Info(fmt.Sprintf("[%d] Integrated: %d", i, props.iGPU)) // DOESN'T REPORT CORRECTLY! Always 0
// TODO Why isn't props.iGPU accurate!?
if strings.EqualFold(name, iGPUName) {
slog.Info("iGPU detected skipping", "id", i)
slog.Info("unsupported Radeon iGPU detected skipping", "id", i, "name", name, "gfx", gfx)
continue
}
if gfxOverride == "" {
Expand All @@ -106,7 +97,7 @@ func AMDGetGPUInfo() []GpuInfo {
slog.Warn("See https://github.com/ollama/ollama/blob/main/docs/troubleshooting.md for HSA_OVERRIDE_GFX_VERSION usage")
continue
} else {
slog.Info("amdgpu is supported", "gpu", i, "gpu_type", gfx)
slog.Debug("amdgpu is supported", "gpu", i, "gpu_type", gfx)
}
}

Expand All @@ -124,8 +115,8 @@ func AMDGetGPUInfo() []GpuInfo {

// TODO revisit this once ROCm v6 is available on windows.
// v5.7 only reports VRAM used by this process, so it's completely wrong and unusable
slog.Info("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
slog.Info("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
slog.Debug("amdgpu memory", "gpu", i, "total", format.HumanBytes2(totalMemory))
slog.Debug("amdgpu memory", "gpu", i, "available", format.HumanBytes2(freeMemory))
gpuInfo := GpuInfo{
Library: "rocm",
memInfo: memInfo{
Expand All @@ -135,31 +126,12 @@ func AMDGetGPUInfo() []GpuInfo {
ID: fmt.Sprintf("%d", i), // TODO this is probably wrong if we specify visible devices
DependencyPath: libDir,
MinimumMemory: rocmMinimumMemory,
}
if major != "" {
gpuInfo.Major, err = strconv.Atoi(major)
if err != nil {
slog.Info("failed to parse version", "version", gfx, "error", err)
}
}
if minor != "" {
gpuInfo.Minor, err = strconv.Atoi(minor)
if err != nil {
slog.Info("failed to parse version", "version", gfx, "error", err)
}
}
if patch != "" {
// Patch rev is hex; e.g. gfx90a
p, err := strconv.ParseInt(patch, 16, 0)
if err != nil {
slog.Info("failed to parse version", "version", gfx, "error", err)
} else {
gpuInfo.Patch = int(p)
}
}
if gpuInfo.Major < RocmComputeMin {
slog.Warn(fmt.Sprintf("amdgpu [%s] too old gfx%d%d%x", gpuInfo.ID, gpuInfo.Major, gpuInfo.Minor, gpuInfo.Patch))
continue
Name: name,
Compute: gfx,

// TODO - this information isn't accurate on Windows, so don't report it until we find the right way to retrieve it
// DriverMajor: driverMajor,
// DriverMinor: driverMinor,
}

resp = append(resp, gpuInfo)
Expand Down
16 changes: 11 additions & 5 deletions gpu/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -119,12 +119,12 @@ func initGPUHandles() *handles {
return gpuHandles
}

slog.Info("Detecting GPUs")
slog.Debug("Detecting GPUs")
nvcudaLibPaths := FindGPULibs(nvcudaMgmtName, nvcudaMgmtPatterns)
if len(nvcudaLibPaths) > 0 {
deviceCount, nvcuda, libPath := LoadNVCUDAMgmt(nvcudaLibPaths)
if nvcuda != nil {
slog.Info("detected GPUs", "count", deviceCount, "library", libPath)
slog.Debug("detected GPUs", "count", deviceCount, "library", libPath)
gpuHandles.nvcuda = nvcuda
gpuHandles.deviceCount = deviceCount
return gpuHandles
Expand All @@ -135,7 +135,7 @@ func initGPUHandles() *handles {
if len(cudartLibPaths) > 0 {
deviceCount, cudart, libPath := LoadCUDARTMgmt(cudartLibPaths)
if cudart != nil {
slog.Info("detected GPUs", "library", libPath, "count", deviceCount)
slog.Debug("detected GPUs", "library", libPath, "count", deviceCount)
gpuHandles.cudart = cudart
gpuHandles.deviceCount = deviceCount
return gpuHandles
Expand Down Expand Up @@ -184,10 +184,14 @@ func GetGPUInfo() GpuInfoList {
gpuInfo := GpuInfo{
Library: "cuda",
}
var driverMajor int
var driverMinor int
if gpuHandles.cudart != nil {
C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
} else {
C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
driverMajor = int(gpuHandles.nvcuda.driver_major)
driverMinor = int(gpuHandles.nvcuda.driver_minor)
}
if memInfo.err != nil {
slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
Expand All @@ -201,10 +205,12 @@ func GetGPUInfo() GpuInfoList {
gpuInfo.TotalMemory = uint64(memInfo.total)
gpuInfo.FreeMemory = uint64(memInfo.free)
gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
gpuInfo.Major = int(memInfo.major)
gpuInfo.Minor = int(memInfo.minor)
gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
gpuInfo.MinimumMemory = cudaMinimumMemory
gpuInfo.DependencyPath = depPath
gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
gpuInfo.DriverMajor = int(driverMajor)
gpuInfo.DriverMinor = int(driverMinor)

// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
resp = append(resp, gpuInfo)
Expand Down
3 changes: 3 additions & 0 deletions gpu/gpu_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,19 @@ extern "C" {
#endif

#define GPU_ID_LEN 64
#define GPU_NAME_LEN 96

// mem_info is the per-device result record filled in by the probe routines
// that take a mem_info_t pointer (e.g. cpu_check_ram(mem_info_t *resp)).
typedef struct mem_info {
char *err; // Error message; if non-nil, caller is responsible for freeing
char gpu_id[GPU_ID_LEN]; // Device identifier, fixed-size buffer of GPU_ID_LEN chars
char gpu_name[GPU_NAME_LEN]; // Device name, fixed-size buffer of GPU_NAME_LEN chars
uint64_t total; // Total memory for the device (presumably bytes - TODO confirm)
uint64_t free; // Free memory for the device (presumably bytes - TODO confirm)

// Compute Capability
int major;
int minor;
int patch; // NOTE(review): presumably only meaningful for backends that report a patch level - confirm
} mem_info_t;

void cpu_check_ram(mem_info_t *resp);
Expand Down

0 comments on commit dc18eee

Please sign in to comment.