Skip to content

Commit 0f1a39f

Browse files
authored
ggml : add AArch64 optimized GEMV and GEMM Q4 kernels (#5780)
* Arm AArch64: optimized GEMV and GEMM kernels for q4_0_q8_0, and q8_0_q8_0 quantization * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add optimized GEMV and GEMM asm kernels for q4_0_q8_0 quantization and refactor code to address llama.cpp pr#5780 suggestions * Arm AArch64: add copyright claim only to ggml-aarch64.cpp and ggml-aarch64.h files * Arm AArch64: minor code refactoring for rebase * Arm AArch64: minor code refactoring for resolving a build issue with cmake * Arm AArch64: minor code refactoring to split the Q4_0_AARCH64 type into three separate types: Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 * Arm AArch64: minor code change for resolving a build issue with server-windows * retrigger checks * Arm AArch64: minor code changes for rebase * Arm AArch64: minor changes to skip the pr#7433 vec_dot code for arm cpus with SVE VL not equal to 256 bits * Arm AArch64: remove stale LLAMA_QKK_64 from CMakeLists.txt and delete build.zig * Arm AArch64: add reference scalar gemm and gemv, and avoid dynamic memory allocations during quantization for Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 * Arm AArch64: add multithreaded quantization support for the new types: Q4_0_4_4, Q4_0_4_8, and Q4_0_8_8 * Arm AArch64: minor code refactoring * Arm AArch64: simplify logic for calling gemm and gemv functions in ggml_compute_forward_mul_mat * Arm AArch64: minimize changes in ggml_compute_forward_mul_mat * Arm AArch64: minor code refactoring, and add reference scalar code to quantize routines for new quant types * Arm AArch64: minor code refactoring * Arm AArch64: minor code refactoring * Arm AArch64: minor code 
refactoring * rebase on the latest master commit 3fd62a6 and adapt to the new directory structure * Arm AArch64: remove a redundant comment * Arm AArch64: add pragma in ggml-aarch64.c to turn -Woverlength-strings warning off * Arm AArch64: use __aarch64__ check to guard 64-bit neon kernels * Arm AArch64: update docs/build.md README to include compile time flags for building the Q4_0_4_4 quant type
1 parent 83321c6 commit 0f1a39f

File tree

14 files changed

+2534
-53
lines changed

14 files changed

+2534
-53
lines changed

Makefile

+9-1
Original file line numberDiff line numberDiff line change
@@ -835,7 +835,8 @@ OBJ_GGML += \
835835
ggml/src/ggml.o \
836836
ggml/src/ggml-alloc.o \
837837
ggml/src/ggml-backend.o \
838-
ggml/src/ggml-quants.o
838+
ggml/src/ggml-quants.o \
839+
ggml/src/ggml-aarch64.o
839840

840841
OBJ_LLAMA = \
841842
src/llama.o \
@@ -969,6 +970,13 @@ ggml/src/ggml-quants.o: \
969970
ggml/src/ggml-common.h
970971
$(CC) $(CFLAGS) -c $< -o $@
971972

973+
ggml/src/ggml-aarch64.o: \
974+
ggml/src/ggml-aarch64.c \
975+
ggml/include/ggml.h \
976+
ggml/src/ggml-aarch64.h \
977+
ggml/src/ggml-common.h
978+
$(CC) $(CFLAGS) -c $< -o $@
979+
972980
ggml/src/ggml-blas.o: \
973981
ggml/src/ggml-blas.cpp \
974982
ggml/include/ggml-blas.h

Package.swift

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ var sources = [
1010
"ggml/src/ggml-alloc.c",
1111
"ggml/src/ggml-backend.c",
1212
"ggml/src/ggml-quants.c",
13+
"ggml/src/ggml-aarch64.c",
1314
]
1415

1516
var resources: [Resource] = []

docs/build.md

+2
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ In order to build llama.cpp you have four different options.
2828
```
2929

3030
- Notes:
31+
- For `Q4_0_4_4` quantization type build, add the `GGML_NO_LLAMAFILE=1` flag. For example, use `make GGML_NO_LLAMAFILE=1`.
3132
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `make -j 8` will run 8 jobs in parallel.
3233
- For faster repeated compilation, install [ccache](https://ccache.dev/).
3334
- For debug builds, run `make LLAMA_DEBUG=1`
@@ -41,6 +42,7 @@ In order to build llama.cpp you have four different options.
4142

4243
**Notes**:
4344

45+
- For `Q4_0_4_4` quantization type build, add the `-DGGML_LLAMAFILE=OFF` cmake option. For example, use `cmake -B build -DGGML_LLAMAFILE=OFF`.
4446
- For faster compilation, add the `-j` argument to run multiple jobs in parallel. For example, `cmake --build build --config Release -j 8` will run 8 jobs in parallel.
4547
- For faster repeated compilation, install [ccache](https://ccache.dev/).
4648
- For debug builds, there are two cases:

examples/quantize/quantize.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
4646
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 5.33G, +0.0569 ppl @ Llama-3-8B", },
4747
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 6.14G, +0.0217 ppl @ Llama-3-8B", },
4848
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 7.96G, +0.0026 ppl @ Llama-3-8B", },
49+
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
50+
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
51+
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
4952
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
5053
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
5154
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },

ggml/include/ggml.h

+17
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,9 @@ extern "C" {
383383
GGML_TYPE_F64 = 28,
384384
GGML_TYPE_IQ1_M = 29,
385385
GGML_TYPE_BF16 = 30,
386+
GGML_TYPE_Q4_0_4_4 = 31,
387+
GGML_TYPE_Q4_0_4_8 = 32,
388+
GGML_TYPE_Q4_0_8_8 = 33,
386389
GGML_TYPE_COUNT,
387390
};
388391

@@ -424,6 +427,9 @@ extern "C" {
424427
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
425428
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
426429
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
430+
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
431+
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
432+
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
427433
};
428434

429435
// available tensor operations:
@@ -2406,6 +2412,12 @@ extern "C" {
24062412
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
24072413
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
24082414
const void * GGML_RESTRICT y, size_t by, int nrc);
2415+
typedef void (*ggml_from_float_to_mat_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr,
2416+
int64_t k, int64_t bx);
2417+
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
2418+
const void * GGML_RESTRICT y, int nr, int nc);
2419+
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
2420+
const void * GGML_RESTRICT y, int nr, int nc);
24092421

24102422
typedef struct {
24112423
const char * type_name;
@@ -2418,6 +2430,11 @@ extern "C" {
24182430
ggml_vec_dot_t vec_dot;
24192431
enum ggml_type vec_dot_type;
24202432
int64_t nrows; // number of rows to process simultaneously;
2433+
int64_t ncols; // number of columns to process simultaneously;
2434+
int64_t interleave_blcksize; // interleave elements in blocks of interleave_blcksize;
2435+
ggml_from_float_to_mat_t from_float_to_mat;
2436+
ggml_gemv_t gemv;
2437+
ggml_gemm_t gemm;
24212438
} ggml_type_traits_t;
24222439

24232440
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

ggml/src/CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -1153,6 +1153,7 @@ add_library(ggml
11531153
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
11541154
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
11551155
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
1156+
ggml-aarch64.c ggml-aarch64.h
11561157
)
11571158

11581159
if (EMSCRIPTEN)

0 commit comments

Comments
 (0)