Skip to content

Commit

Permalink
Faster AVX2 matrix multiplications for legacy quants (#405)
Browse files (browse the repository at this point in the history)
* Matrix multiplications for legacy quants
* Very slightly faster Q5 dequantization
* Restore faster AVX512VNNI+AVX512VL performance
  • Loading branch information
ikawrakow committed May 10, 2024
1 parent 30cdd9c commit eaa756d
Show file tree
Hide file tree
Showing 7 changed files with 397 additions and 3 deletions.
3 changes: 2 additions & 1 deletion llamafile/BUILD.mk
Original file line number | Diff line number | Diff line change
Expand Up @@ -89,7 +89,8 @@ o/$(MODE)/llamafile: \
#

# Per-object compiler flag overrides, written as GNU make target-specific
# variables. Each line appends (+=) flags for a single object file only; the
# `private` modifier keeps the override from being inherited by that target's
# prerequisites.
#
# NOTE(review): the `-Xx86_64-` prefix presumably restricts the wrapped flag to
# the x86-64 half of the build -- confirm against the toolchain documentation.

# sgemm.o is additionally compiled with -Os.
o/$(MODE)/llamafile/sgemm.o: private CXXFLAGS += -Os

# iqk_mul_mat kernels: all tuned for skylake with AVX2 + FMA enabled.
# The _amd_avx2 variant also enables F16C; the _amd_zen4 variant further
# enables AVX512F, AVX512VL and AVX512VNNI.
# NOTE(review): this excerpt comes from a diff reported as "2 additions &
# 1 deletion", so the plain iqk_mul_mat.o rule is presumably the line that was
# replaced by the two variant-specific rules -- confirm against the actual file.
o/$(MODE)/llamafile/iqk_mul_mat.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mavx2 -Xx86_64-mfma
o/$(MODE)/llamafile/iqk_mul_mat_amd_avx2.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mavx2 -Xx86_64-mfma -Xx86_64-mf16c
o/$(MODE)/llamafile/iqk_mul_mat_amd_zen4.o: private TARGET_ARCH += -Xx86_64-mtune=skylake -Xx86_64-mavx2 -Xx86_64-mfma -Xx86_64-mf16c -Xx86_64-mavx512f -Xx86_64-mavx512vl -Xx86_64-mavx512vnni

# tinyblas kernels: the _amd_avx objects are tuned for sandybridge with F16C;
# the _amd_fma object is tuned for bdver2 with F16C + FMA.
o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c
o/$(MODE)/llamafile/tinyblas_cpu_mixmul_amd_avx.o: private TARGET_ARCH += -Xx86_64-mtune=sandybridge -Xx86_64-mf16c
o/$(MODE)/llamafile/tinyblas_cpu_sgemm_amd_fma.o: private TARGET_ARCH += -Xx86_64-mtune=bdver2 -Xx86_64-mf16c -Xx86_64-mfma
Expand Down

0 comments on commit eaa756d

Please sign in to comment.