Release llamafile v0.8.2
- Upgrade to cosmocc 3.3.6
- Remove warnings from cuda build
- Fix bug in llamafile_trapping_enabled
- Refactor the new vectorized expf() code
- iqk_mul_mat() only needs codegen for AVX2
- Be less gung ho about the -ngl flag in README
- Restore shell scriptability fix for new tokenizer
- Suppress divide by zero errors in llama_print_timings()
- Cut back on tinyBLAS CPU multiple output type kernels
- Cut back NVIDIA fat binary releases to -arch=all-major
- Remove GA (GitHub Actions; won't rely on slow broken irregular cloud dev tools)
- Cut flash_attn_ext from release binaries (use --recompile to have it; see the example below)
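
A quick note on the two flag changes called out above. The README examples below drop `-ngl 9999`, and the prebuilt binaries drop the flash_attn_ext code, but both can still be requested explicitly. This is only a sketch: the model filename is illustrative, and `--recompile` simply rebuilds the GPU module locally, which (per the note above) is what brings the cut flash_attn_ext code back.

```sh
# Offload as many layers as possible to the GPU (the README no longer
# passes -ngl by default) and rebuild the local GPU module so the
# flash_attn_ext code cut from the release binaries is available again.
./llava-v1.5-7b-q4.llamafile -ngl 9999 --recompile
```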
jart committed May 9, 2024
1 parent 564d9fb commit 4ee1e39
Showing 24 changed files with 242 additions and 467 deletions.
42 changes: 0 additions & 42 deletions .github/workflows/ci.yml

This file was deleted.

12 changes: 6 additions & 6 deletions README.md
@@ -42,7 +42,7 @@ chmod +x llava-v1.5-7b-q4.llamafile
5. Run the llamafile. e.g.:

```sh
./llava-v1.5-7b-q4.llamafile -ngl 9999
./llava-v1.5-7b-q4.llamafile
```

6. Your browser should open automatically and display a chat interface.
@@ -184,19 +184,19 @@ try out llamafile with different kinds of LLMs.
Here is an example for the Mistral command-line llamafile:

```sh
./mistral-7b-instruct-v0.2.Q5_K_M.llamafile -ngl 9999 --temp 0.7 -p '[INST]Write a story about llamas[/INST]'
./mistral-7b-instruct-v0.2.Q5_K_M.llamafile --temp 0.7 -p '[INST]Write a story about llamas[/INST]'
```

And here is an example for WizardCoder-Python command-line llamafile:

```sh
./wizardcoder-python-13b.llamafile -ngl 9999 --temp 0 -e -r '```\n' -p '```c\nvoid *memcpy_sse2(char *dst, const char *src, size_t size) {\n'
./wizardcoder-python-13b.llamafile --temp 0 -e -r '```\n' -p '```c\nvoid *memcpy_sse2(char *dst, const char *src, size_t size) {\n'
```

And here's an example for the LLaVA command-line llamafile:

```sh
./llava-v1.5-7b-q4.llamafile -ngl 9999 --temp 0.2 --image lemurs.jpg -e -p '### User: What do you see?\n### Assistant:'
./llava-v1.5-7b-q4.llamafile --temp 0.2 --image lemurs.jpg -e -p '### User: What do you see?\n### Assistant:'
```

As before, macOS, Linux, and BSD users will need to use the "chmod"
@@ -266,7 +266,7 @@ For Windows users, here's an example for the Mistral LLM:
```sh
curl -L -o llamafile.exe https://github.com/Mozilla-Ocho/llamafile/releases/download/0.6/llamafile-0.6
curl -L -o mistral.gguf https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf
./llamafile.exe -m mistral.gguf -ngl 9999
./llamafile.exe -m mistral.gguf
```

Windows users may need to change `./llamafile.exe` to `.\llamafile.exe`
@@ -423,7 +423,7 @@ llama.cpp command line interface, utilizing WizardCoder-Python-13B
weights:

```sh
llamafile -ngl 9999 \
llamafile \
-m wizardcoder-python-13b-v1.0.Q8_0.gguf \
--temp 0 -r '}\n' -r '```\n' \
-e -p '```c\nvoid *memcpy(void *dst, const void *src, size_t size) {\n'
7 changes: 4 additions & 3 deletions build/config.mk
@@ -2,7 +2,7 @@
#── vi: set noet ft=make ts=8 sw=8 fenc=utf-8 :vi ────────────────────┘

PREFIX = /usr/local
COSMOCC = .cosmocc/3.3.5
COSMOCC = .cosmocc/3.3.6
TOOLCHAIN = $(COSMOCC)/bin/cosmo

AR = $(TOOLCHAIN)ar
@@ -50,6 +50,7 @@ clean:; rm -rf o
.PHONY: distclean
distclean:; rm -rf o .cosmocc

.cosmocc/3.3.5:
build/download-cosmocc.sh $@ 3.3.5 db78fd8d3f8706e9dff4be72bf71d37a3f12062f212f407e1c33bc4af3780dd0
.cosmocc/3.3.6:
build/download-cosmocc.sh $@ 3.3.6 26e3449357f31b82489774ef5c2d502a711bb711d4faf99a5fd6c96328a1c205
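
The rule above fetches a cosmocc toolchain pinned by version and SHA-256. The body of build/download-cosmocc.sh is not part of this diff, so the following is only a rough sketch of a checksum-pinned fetch with the same argument order (destination, version, hash); the download URL and the unpack step are assumptions, not taken from the actual script.

```sh
#!/bin/sh
# Hypothetical stand-in for build/download-cosmocc.sh (dest, version, sha256).
set -e
dest=$1 version=$2 sha256=$3
mkdir -p "$dest"
curl -fLo "$dest.zip" "https://cosmo.zip/pub/cosmocc/cosmocc-$version.zip"  # URL is an assumption
printf '%s  %s\n' "$sha256" "$dest.zip" | sha256sum -c -  # refuse a tampered or truncated archive
unzip -q "$dest.zip" -d "$dest"
```

Pinning the hash in the Makefile keeps a toolchain bump a small, reproducible change, which is exactly what the 3.3.5 → 3.3.6 hunk above shows.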


39 changes: 29 additions & 10 deletions llama.cpp/ggml-cuda.cu
@@ -212,7 +212,6 @@
#include "ggml-backend-impl.h"

static const struct ggml_backend_api *g_backend;
#define exit g_backend->exit
#define getenv g_backend->getenv
#define FLAG_log_disable (*g_backend->FLAG_log_disable)
#define ggml_backend_register g_backend->ggml_backend_register
@@ -242,6 +241,18 @@ static const struct ggml_backend_api *g_backend;
#define ggml_is_empty g_backend->ggml_is_empty
#define ggml_op_desc g_backend->ggml_op_desc

[[noreturn]]
static void exit_(int rc) {
g_backend->exit(rc);
#define exit exit_
#if defined(__GNUC__) || defined(__llvm__)
__builtin_unreachable();
#elif defined(_MSC_VER)
__assume(0);
#endif
for (;;);
}

// printf() and fprintf() runtime bridge
// this is needed so text gets printed on windows
// it also helps ensure the atomicity of log lines
@@ -484,6 +495,14 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
}

#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL)
#define FP16_MMA_AVAILABLE (!(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA)
#if FP16_MMA_AVAILABLE
#include <mma.h>
#endif

#if defined(GGML_MINIMIZE_CODE_SIZE) && FP16_AVAILABLE // [jart]
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
@@ -496,6 +515,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
#endif // [jart]

#if CUDART_VERSION < CUDART_HMASK
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
@@ -588,15 +608,6 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
}
#endif // defined(GGML_USE_HIPBLAS)

#define FP16_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL

#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA

#if FP16_MMA_AVAILABLE
#include <mma.h>
#endif

// TODO: move to ggml-common.h
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

@@ -823,7 +834,9 @@ void ggml_cuda_op_dequantize_mul_mat_vec(
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
const int64_t src1_padded_row_size, cudaStream_t stream);

#ifndef GGML_MINIMIZE_CODE_SIZE // [jart]
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
#endif

#define CUDA_GET_ROWS_BLOCK_SIZE 256

@@ -5785,6 +5798,7 @@ template <int D, int cols_per_block, int nwarps, typename KQ_acc_t> void launch_
launch_fattn_f16_impl<D, cols_per_block, nwarps, 1, KQ_acc_t>(Q, K, V, KQV, mask, pool, main_stream);
}

#ifndef GGML_MINIMIZE_CODE_SIZE // [jart]
void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
@@ -5966,6 +5980,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
}
return;
}
#endif // GGML_MINIMIZE_CODE_SIZE [jart]

template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void k_get_rows(
@@ -12501,9 +12516,11 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ARGSORT:
ggml_cuda_op_argsort(ctx, dst);
break;
#ifndef GGML_MINIMIZE_CODE_SIZE // [jart]
case GGML_OP_FLASH_ATTN_EXT:
ggml_cuda_flash_attn_ext(ctx, dst);
break;
#endif
default:
return false;
}
@@ -12778,7 +12795,9 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
case GGML_OP_LEAKY_RELU:
#ifndef GGML_MINIMIZE_CODE_SIZE // [jart]
case GGML_OP_FLASH_ATTN_EXT:
#endif
return true;
default:
return false;
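
The `#ifndef GGML_MINIMIZE_CODE_SIZE // [jart]` guards above compile the flash-attention declaration, its dispatch case, and its supports-op case out of size-reduced builds, which appears to be how the release binaries drop flash_attn_ext while `--recompile` builds keep it (per the release note at the top). Below is a self-contained illustration of that gating pattern; the enum, macro, and function names are simplified stand-ins, not the real ggml identifiers.

```c
// Illustrative only: gate an optional op behind a size-reduction macro so
// callers see it as unsupported (and take a fallback path) in minimal builds.
#include <stdbool.h>
#include <stdio.h>

enum op { OP_ARGSORT, OP_FLASH_ATTN_EXT };

static bool op_supported(enum op op) {
    switch (op) {
    case OP_ARGSORT:
#ifndef MINIMIZE_CODE_SIZE
    case OP_FLASH_ATTN_EXT:   // compiled out of minimal builds, like the hunks above
#endif
        return true;
    default:
        return false;
    }
}

int main(void) {
    // Built with -DMINIMIZE_CODE_SIZE this prints 0, otherwise 1.
    printf("flash_attn_ext supported: %d\n", op_supported(OP_FLASH_ATTN_EXT));
    return 0;
}
```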
