Skip to content

Commit

Permalink
ggml : add dynamic CUDA driver loader and static link against CUDA runtime
Browse files Browse the repository at this point in the history

This approach lets CUDA-enabled binaries run on systems without
CUDA-capable GPUs and fall back to alternative computation methods.
  • Loading branch information
didzis committed Apr 16, 2024
1 parent 9fab281 commit 516a409
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 5 deletions.
11 changes: 9 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ else()
option(WHISPER_MKL                 "whisper: use Intel Math Kernel Library (MKL)"  OFF)
option(WHISPER_SYCL                "whisper: use SYCL"                             OFF)
option(WHISPER_SYCL_F16            "whisper: use 16 bit floats for sycl calculations" OFF)

# When ON, the CUDA driver API is resolved at runtime via dlopen (cuda-loader.c)
# instead of linking against libcuda, so CUDA-enabled binaries can also run on
# machines that have no CUDA driver installed.
option(WHISPER_DYNAMIC_CUDA "whisper: load CUDA dynamically" OFF)
endif()

option(WHISPER_PERF "whisper: enable perf timings" OFF)
Expand Down Expand Up @@ -337,7 +339,7 @@ if (WHISPER_CUDA)

add_compile_definitions(GGML_USE_CUDA)

if (WHISPER_STATIC)
if (WHISPER_STATIC OR WHISPER_DYNAMIC_CUDA)
if (WIN32)
# As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
Expand All @@ -348,7 +350,12 @@ if (WHISPER_CUDA)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()

set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
if (WHISPER_DYNAMIC_CUDA)
set(GGML_SOURCES_CUDA ${GGML_SOURCES_CUDA} cuda-loader.c)
else()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
endif()

else()
message(FATAL_ERROR "cuBLAS not found")
endif()
Expand Down
8 changes: 7 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,13 @@ ifdef WHISPER_CUDA

CFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
CXXFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
ifdef WHISPER_DYNAMIC_CUDA
LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -lrt
WHISPER_OBJ += cuda-loader.o
else
LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt
endif
LDFLAGS += -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
WHISPER_OBJ += ggml-cuda.o
WHISPER_OBJ += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
NVCC = nvcc
Expand Down
162 changes: 162 additions & 0 deletions cuda-loader.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#include <stdio.h>

#include <dlfcn.h>

#include <cuda.h>


// Function-pointer types mirroring the CUDA driver API entry points used by
// the wrappers below; signatures match the declarations in cuda.h.
typedef CUresult (*cuDeviceGet_pt)(CUdevice *device, int ordinal);
typedef CUresult (*cuDeviceGetAttribute_pt)(int *pi, CUdevice_attribute attrib, CUdevice dev);
typedef CUresult (*cuGetErrorString_pt)(CUresult error, const char **pStr);
typedef CUresult (*cuMemGetAllocationGranularity_pt)(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option);
typedef CUresult (*cuMemCreate_pt)(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags);
typedef CUresult (*cuMemAddressReserve_pt)(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags);
typedef CUresult (*cuMemAddressFree_pt)(CUdeviceptr ptr, size_t size);
typedef CUresult (*cuMemMap_pt)(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags);
typedef CUresult (*cuMemUnmap_pt)(CUdeviceptr ptr, size_t size);
typedef CUresult (*cuMemRelease_pt)(CUmemGenericAllocationHandle handle);
typedef CUresult (*cuMemSetAccess_pt)(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count);


// Pointers resolved at runtime by load_libcuda() via dlsym(). Each stays NULL
// until the first wrapper call triggers loading, and remains NULL if loading
// fails or the installed driver does not export the symbol.
cuDeviceGet_pt _cuDeviceGet = NULL;
cuDeviceGetAttribute_pt _cuDeviceGetAttribute = NULL;
cuGetErrorString_pt _cuGetErrorString = NULL;
cuMemGetAllocationGranularity_pt _cuMemGetAllocationGranularity = NULL;
cuMemCreate_pt _cuMemCreate = NULL;
cuMemAddressReserve_pt _cuMemAddressReserve = NULL;
cuMemAddressFree_pt _cuMemAddressFree = NULL;
cuMemMap_pt _cuMemMap = NULL;
cuMemUnmap_pt _cuMemUnmap = NULL;
cuMemRelease_pt _cuMemRelease = NULL;
cuMemSetAccess_pt _cuMemSetAccess = NULL;


// Try (once) to open the CUDA driver library and resolve the driver API
// symbols used by ggml-cuda.
//
// Returns 1 when libcuda is loaded, 0 when it is unavailable. The outcome is
// cached in a static handle: NULL = not tried yet, (void*)1 = tried and
// failed (no further dlopen attempts), anything else = the dlopen handle.
// NOTE(review): not thread-safe; presumably first called before any worker
// threads touch CUDA — confirm against callers.
int load_libcuda(void) {

    static void * libcuda = NULL;

    if (libcuda == (void*)1)
        return 0;

    if (libcuda != NULL)
        return 1;

    // Prefer the unversioned name (dev installs), fall back to the SONAME
    // that driver-only installs provide.
    libcuda = dlopen("libcuda.so", RTLD_NOW);

    if (libcuda == NULL) {
        libcuda = dlopen("libcuda.so.1", RTLD_NOW);
    }

    if (libcuda == NULL) {
        // dlerror() may legitimately return NULL; passing NULL to %s is
        // undefined behavior, so substitute a fixed message in that case.
        const char * err = dlerror();
        fprintf(stderr, "error: failed to load libcuda.so: %s\n", err != NULL ? err : "unknown error");
        libcuda = (void*)1; // tried and failed
        return 0;
    }

    // Function-pointer <-> object-pointer casts trip -Wpedantic, but are
    // required by the dlsym() interface on POSIX systems.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
    _cuDeviceGet = (cuDeviceGet_pt)dlsym(libcuda, "cuDeviceGet");
    _cuDeviceGetAttribute = (cuDeviceGetAttribute_pt)dlsym(libcuda, "cuDeviceGetAttribute");
    _cuGetErrorString = (cuGetErrorString_pt)dlsym(libcuda, "cuGetErrorString");
    _cuMemGetAllocationGranularity = (cuMemGetAllocationGranularity_pt)dlsym(libcuda, "cuMemGetAllocationGranularity");
    _cuMemCreate = (cuMemCreate_pt)dlsym(libcuda, "cuMemCreate");
    _cuMemAddressReserve = (cuMemAddressReserve_pt)dlsym(libcuda, "cuMemAddressReserve");
    _cuMemAddressFree = (cuMemAddressFree_pt)dlsym(libcuda, "cuMemAddressFree");
    _cuMemMap = (cuMemMap_pt)dlsym(libcuda, "cuMemMap");
    _cuMemUnmap = (cuMemUnmap_pt)dlsym(libcuda, "cuMemUnmap");
    _cuMemRelease = (cuMemRelease_pt)dlsym(libcuda, "cuMemRelease");
    _cuMemSetAccess = (cuMemSetAccess_pt)dlsym(libcuda, "cuMemSetAccess");
#pragma GCC diagnostic pop

    return 1;
}


// Stub replacing the real cuDeviceGet: loads libcuda on first use and
// forwards to the resolved driver entry point.
CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) {
    if (_cuDeviceGet == NULL) {
        if (!load_libcuda())
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        if (_cuDeviceGet == NULL)
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
    }
    return _cuDeviceGet(device, ordinal);
}

// Stub replacing the real cuDeviceGetAttribute: loads libcuda on first use
// and forwards to the resolved driver entry point.
CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev) {
    if (_cuDeviceGetAttribute == NULL) {
        if (!load_libcuda())
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        if (_cuDeviceGetAttribute == NULL)
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
    }
    return _cuDeviceGetAttribute(pi, attrib, dev);
}

// Stub replacing the real cuGetErrorString: loads libcuda on first use and
// forwards to the resolved driver entry point.
CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr) {
    if (_cuGetErrorString == NULL) {
        if (!load_libcuda())
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        if (_cuGetErrorString == NULL)
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
    }
    return _cuGetErrorString(error, pStr);
}

// Stub replacing the real cuMemGetAllocationGranularity: loads libcuda on
// first use and forwards to the resolved driver entry point.
CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option) {
    if (_cuMemGetAllocationGranularity == NULL) {
        if (!load_libcuda())
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        if (_cuMemGetAllocationGranularity == NULL)
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
    }
    return _cuMemGetAllocationGranularity(granularity, prop, option);
}

// Stub replacing the real cuMemCreate: loads libcuda on first use and
// forwards to the resolved driver entry point.
CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags) {
    if (_cuMemCreate == NULL) {
        if (!load_libcuda())
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        if (_cuMemCreate == NULL)
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
    }
    return _cuMemCreate(handle, size, prop, flags);
}

// Stub replacing the real cuMemAddressReserve: loads libcuda on first use
// and forwards to the resolved driver entry point.
CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags) {
    if (_cuMemAddressReserve == NULL) {
        if (!load_libcuda())
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        if (_cuMemAddressReserve == NULL)
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
    }
    return _cuMemAddressReserve(ptr, size, alignment, addr, flags);
}

// Stub replacing the real cuMemAddressFree: loads libcuda on first use and
// forwards to the resolved driver entry point.
CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size) {
    if (_cuMemAddressFree == NULL) {
        if (!load_libcuda())
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        if (_cuMemAddressFree == NULL)
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
    }
    return _cuMemAddressFree(ptr, size);
}

// Stub replacing the real cuMemMap: loads libcuda on first use and forwards
// to the resolved driver entry point.
CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags) {
    if (_cuMemMap == NULL) {
        if (!load_libcuda())
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        if (_cuMemMap == NULL)
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
    }
    return _cuMemMap(ptr, size, offset, handle, flags);
}

// Stub replacing the real cuMemUnmap: loads libcuda on first use and
// forwards to the resolved driver entry point.
CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size) {
    if (_cuMemUnmap == NULL) {
        if (!load_libcuda())
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        if (_cuMemUnmap == NULL)
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
    }
    return _cuMemUnmap(ptr, size);
}

// Stub replacing the real cuMemRelease: loads libcuda on first use and
// forwards to the resolved driver entry point.
CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle) {
    if (_cuMemRelease == NULL) {
        if (!load_libcuda())
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        if (_cuMemRelease == NULL)
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
    }
    return _cuMemRelease(handle);
}

// Stub replacing the real cuMemSetAccess: loads libcuda on first use and
// forwards to the resolved driver entry point.
CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count) {
    if (_cuMemSetAccess == NULL) {
        if (!load_libcuda())
            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
        if (_cuMemSetAccess == NULL)
            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
    }
    return _cuMemSetAccess(ptr, size, desc, count);
}
11 changes: 9 additions & 2 deletions ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,9 @@ inline static void * ggml_calloc(size_t num, size_t size) {
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif
#if defined(GGML_USE_CUDA)
#include "ggml-cuda.h"
#endif

// floating point type used to accumulate sums
typedef double ggml_float;
Expand Down Expand Up @@ -21668,7 +21671,11 @@ int ggml_cpu_has_wasm_simd(void) {
}

int ggml_cpu_has_blas(void) {
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
return 1;
#elif defined(GGML_USE_CUDA)
return ggml_backend_cuda_get_device_count() > 0;
#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1;
#else
return 0;
Expand All @@ -21677,7 +21684,7 @@ int ggml_cpu_has_blas(void) {

int ggml_cpu_has_cuda(void) {
#if defined(GGML_USE_CUDA)
return 1;
return ggml_backend_cuda_get_device_count() > 0;
#else
return 0;
#endif
Expand Down

0 comments on commit 516a409

Please sign in to comment.