Dynamic CUDA driver loader #1841

Open
wants to merge 1 commit into base: master
11 changes: 9 additions & 2 deletions CMakeLists.txt
@@ -88,6 +88,8 @@ else()
option(WHISPER_MKL "whisper: use Intel Math Kernel Library (MKL)" OFF)
option(WHISPER_SYCL "whisper: use SYCL" OFF)
option(WHISPER_SYCL_F16 "whisper: use 16 bit floats for sycl calculations" OFF)

+option(WHISPER_DYNAMIC_CUDA "whisper: load CUDA dynamically" OFF)
endif()

option(WHISPER_PERF "whisper: enable perf timings" OFF)
@@ -337,7 +339,7 @@ if (WHISPER_CUDA)

add_compile_definitions(GGML_USE_CUDA)

-if (WHISPER_STATIC)
+if (WHISPER_STATIC OR WHISPER_DYNAMIC_CUDA)
if (WIN32)
# As of 12.3.1 the CUDA Toolkit for Windows does not offer a static cublas library
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
@@ -348,7 +350,12 @@ if (WHISPER_CUDA)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()

-set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
+if (WHISPER_DYNAMIC_CUDA)
+    set(GGML_SOURCES_CUDA ${GGML_SOURCES_CUDA} cuda-loader.c)
+else()
+    set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
+endif()

else()
message(FATAL_ERROR "cuBLAS not found")
endif()
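Usage note (editorial, not part of the diff): a dynamically loading CUDA build would presumably be configured with both flags, e.g. cmake -DWHISPER_CUDA=ON -DWHISPER_DYNAMIC_CUDA=ON, which compiles cuda-loader.c into the CUDA sources and drops the direct CUDA::cuda_driver link, so the binary no longer carries a hard libcuda.so.1 dependency.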
8 changes: 7 additions & 1 deletion Makefile
@@ -277,7 +277,13 @@ ifdef WHISPER_CUDA

CFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
CXXFLAGS += -DGGML_USE_CUDA -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
-LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
+ifdef WHISPER_DYNAMIC_CUDA
+LDFLAGS += -lcublas_static -lculibos -lcudart_static -lcublasLt_static -lpthread -ldl -lrt
+WHISPER_OBJ += cuda-loader.o
+else
+LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt
+endif
+LDFLAGS += -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
WHISPER_OBJ += ggml-cuda.o
WHISPER_OBJ += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
NVCC = nvcc
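Usage note (editorial): the make path would presumably be driven as make WHISPER_CUDA=1 WHISPER_DYNAMIC_CUDA=1. Note that this branch also switches cuBLAS and the CUDA runtime to their static variants (-lcublas_static, -lcudart_static, -lcublasLt_static), so libcuda.so is the only CUDA library left to resolve at run time.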
162 changes: 162 additions & 0 deletions cuda-loader.c
@@ -0,0 +1,162 @@
#include <stdio.h>

#include <dlfcn.h>

#include <cuda.h>


typedef CUresult (*cuDeviceGet_pt)(CUdevice *device, int ordinal);
typedef CUresult (*cuDeviceGetAttribute_pt)(int *pi, CUdevice_attribute attrib, CUdevice dev);
typedef CUresult (*cuGetErrorString_pt)(CUresult error, const char **pStr);
typedef CUresult (*cuMemGetAllocationGranularity_pt)(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option);
typedef CUresult (*cuMemCreate_pt)(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags);
typedef CUresult (*cuMemAddressReserve_pt)(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags);
typedef CUresult (*cuMemAddressFree_pt)(CUdeviceptr ptr, size_t size);
typedef CUresult (*cuMemMap_pt)(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags);
typedef CUresult (*cuMemUnmap_pt)(CUdeviceptr ptr, size_t size);
typedef CUresult (*cuMemRelease_pt)(CUmemGenericAllocationHandle handle);
typedef CUresult (*cuMemSetAccess_pt)(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count);


// Entry points into the real driver library, resolved lazily by load_libcuda().
cuDeviceGet_pt _cuDeviceGet = NULL;
cuDeviceGetAttribute_pt _cuDeviceGetAttribute = NULL;
cuGetErrorString_pt _cuGetErrorString = NULL;
cuMemGetAllocationGranularity_pt _cuMemGetAllocationGranularity = NULL;
cuMemCreate_pt _cuMemCreate = NULL;
cuMemAddressReserve_pt _cuMemAddressReserve = NULL;
cuMemAddressFree_pt _cuMemAddressFree = NULL;
cuMemMap_pt _cuMemMap = NULL;
cuMemUnmap_pt _cuMemUnmap = NULL;
cuMemRelease_pt _cuMemRelease = NULL;
cuMemSetAccess_pt _cuMemSetAccess = NULL;


// Load libcuda.so once and resolve the entry points above.
// Returns 1 on success, 0 on failure; not thread-safe.
int load_libcuda(void) {

static void * libcuda = NULL;

if (libcuda == (void*)1) // a previous attempt already failed; don't retry
return 0;

if (libcuda != NULL) // already loaded and resolved
return 1;

// Try the unversioned name first (usually from a driver dev package),
// then the soname that ships with the driver itself.
libcuda = dlopen("libcuda.so", RTLD_NOW);

if (libcuda == NULL) {
libcuda = dlopen("libcuda.so.1", RTLD_NOW);
}

if (libcuda != NULL) {
#pragma GCC diagnostic push
// dlsym() returns void *; casting it to a function pointer is not ISO C
#pragma GCC diagnostic ignored "-Wpedantic"
_cuDeviceGet = (cuDeviceGet_pt)dlsym(libcuda, "cuDeviceGet");
_cuDeviceGetAttribute = (cuDeviceGetAttribute_pt)dlsym(libcuda, "cuDeviceGetAttribute");
_cuGetErrorString = (cuGetErrorString_pt)dlsym(libcuda, "cuGetErrorString");
_cuMemGetAllocationGranularity = (cuMemGetAllocationGranularity_pt)dlsym(libcuda, "cuMemGetAllocationGranularity");
_cuMemCreate = (cuMemCreate_pt)dlsym(libcuda, "cuMemCreate");
_cuMemAddressReserve = (cuMemAddressReserve_pt)dlsym(libcuda, "cuMemAddressReserve");
_cuMemAddressFree = (cuMemAddressFree_pt)dlsym(libcuda, "cuMemAddressFree");
_cuMemMap = (cuMemMap_pt)dlsym(libcuda, "cuMemMap");
_cuMemUnmap = (cuMemUnmap_pt)dlsym(libcuda, "cuMemUnmap");
_cuMemRelease = (cuMemRelease_pt)dlsym(libcuda, "cuMemRelease");
_cuMemSetAccess = (cuMemSetAccess_pt)dlsym(libcuda, "cuMemSetAccess");
#pragma GCC diagnostic pop

return 1;
}

fprintf(stderr, "error: failed to load libcuda.so: %s\n", dlerror());

libcuda = (void*)1; // tried and failed
return 0;
}
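
// Each driver entry point above is re-exported below as a thin wrapper with
// the original signature: load the library lazily on first use, fail soft
// with CUDA_ERROR_SHARED_OBJECT_INIT_FAILED if libcuda.so cannot be loaded,
// and with CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND if a symbol is missing.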


CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal) {
if (_cuDeviceGet == NULL && !load_libcuda())
return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
if (_cuDeviceGet == NULL)
return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
return _cuDeviceGet(device, ordinal);
}

CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev) {
if (_cuDeviceGetAttribute == NULL && !load_libcuda())
return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
if (_cuDeviceGetAttribute == NULL)
return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
return _cuDeviceGetAttribute(pi, attrib, dev);
}

CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr) {
if (_cuGetErrorString == NULL && !load_libcuda())
return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
if (_cuGetErrorString == NULL)
return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
return _cuGetErrorString(error, pStr);
}

CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option) {
if (_cuMemGetAllocationGranularity == NULL && !load_libcuda())
return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
if (_cuMemGetAllocationGranularity == NULL)
return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
return _cuMemGetAllocationGranularity(granularity, prop, option);
}

CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags) {
if (_cuMemCreate == NULL && !load_libcuda())
return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
if (_cuMemCreate == NULL)
return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
return _cuMemCreate(handle, size, prop, flags);
}

CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags) {
if (_cuMemAddressReserve == NULL && !load_libcuda())
return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
if (_cuMemAddressReserve == NULL)
return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
return _cuMemAddressReserve(ptr, size, alignment, addr, flags);
}

CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size) {
if (_cuMemAddressFree == NULL && !load_libcuda())
return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
if (_cuMemAddressFree == NULL)
return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
return _cuMemAddressFree(ptr, size);
}

CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags) {
if (_cuMemMap == NULL && !load_libcuda())
return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
if (_cuMemMap == NULL)
return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
return _cuMemMap(ptr, size, offset, handle, flags);
}

CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size) {
if (_cuMemUnmap == NULL && !load_libcuda())
return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
if (_cuMemUnmap == NULL)
return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
return _cuMemUnmap(ptr, size);
}

CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle) {
if (_cuMemRelease == NULL && !load_libcuda())
return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
if (_cuMemRelease == NULL)
return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
return _cuMemRelease(handle);
}

CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count) {
if (_cuMemSetAccess == NULL && !load_libcuda())
return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
if (_cuMemSetAccess == NULL)
return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
return _cuMemSetAccess(ptr, size, desc, count);
}
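
How the stubs behave (illustrative sketch, not part of the diff): callers keep using the driver API unchanged. On a machine without the NVIDIA driver, the first wrapped call fails soft instead of the dynamic linker refusing to start the process over an unresolved libcuda.so.1:

    CUdevice dev;
    CUresult res = cuDeviceGet(&dev, 0);  // triggers dlopen on first use
    if (res == CUDA_ERROR_SHARED_OBJECT_INIT_FAILED) {
        // libcuda.so is absent: fall back to CPU-only code paths
    }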
11 changes: 9 additions & 2 deletions ggml.c
@@ -288,6 +288,9 @@ inline static void * ggml_calloc(size_t num, size_t size) {
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
#endif
+#if defined(GGML_USE_CUDA)
+#include "ggml-cuda.h"
+#endif

// floating point type used to accumulate sums
typedef double ggml_float;
@@ -21668,7 +21671,11 @@ int ggml_cpu_has_wasm_simd(void) {
}

int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+return 1;
+#elif defined(GGML_USE_CUDA)
+return ggml_backend_cuda_get_device_count() > 0;
+#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1;
#else
return 0;
@@ -21677,7 +21684,7 @@ int ggml_cpu_has_blas(void) {

int ggml_cpu_has_cuda(void) {
#if defined(GGML_USE_CUDA)
-return 1;
+return ggml_backend_cuda_get_device_count() > 0;
#else
return 0;
#endif
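
Runtime effect of the ggml.c changes: ggml_cpu_has_cuda() now reports whether a CUDA device is actually available rather than whether CUDA support was compiled in, so one CUDA-enabled binary can degrade gracefully. A minimal sketch (assuming ggml_backend_cuda_get_device_count() returns 0 when the driver is missing, which is not shown in this diff):

    if (ggml_cpu_has_cuda()) {
        // driver loaded and at least one device found: use the CUDA backend
    } else {
        // no driver or no device: stay on the CPU backend
    }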