Skip to content

Commit a0a4646

Browse files
committed
fixes
1 parent 02f8cdf commit a0a4646

File tree

9 files changed

+113
-101
lines changed

9 files changed

+113
-101
lines changed

Makefile

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -874,9 +874,8 @@ ggml/src/ggml-cuda/%.o: \
874874
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
875875
endif # GGML_HIPBLAS
876876

877-
ifdef GGML_CPU_AARCH64
877+
ifndef GGML_NO_CPU_AARCH64
878878
MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
879-
MK_CFLAGS += -DGGML_USE_CPU_AARCH64
880879
endif
881880

882881
ifdef GGML_METAL
@@ -888,7 +887,7 @@ ifdef GGML_METAL_NDEBUG
888887
endif
889888
ifdef GGML_METAL_EMBED_LIBRARY
890889
MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
891-
OBJ_GGML += ggml/src/ggml-metal-embed.o
890+
OBJ_GGML += ggml/src/ggml-metal-embed.o
892891
endif
893892
endif # GGML_METAL
894893

ggml/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ else()
9292
endif()
9393

9494
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
95-
option(GGML_CPU_AARCH64 "ggml: use runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu" OFF)
95+
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
9696

9797
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
9898
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})

ggml/include/ggml-cpu.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -145,9 +145,10 @@ extern "C" {
145145
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
146146
#endif
147147

148-
#ifdef GGML_USE_CPU_AARCH64
149148
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
150-
#endif
149+
GGML_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
150+
151+
151152

152153
#ifdef __cplusplus
153154
}

ggml/src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -881,7 +881,7 @@ if (GGML_CPU_HBM)
881881
endif()
882882
883883
if (GGML_CPU_AARCH64)
884-
message(STATUS "Using runtime weight quantization to enable optimized GEMM/GEMV kernels for AARCH64 cpu")
884+
message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
885885
886886
add_compile_definitions(GGML_USE_CPU_AARCH64)
887887
endif()

ggml/src/ggml-aarch64.c

Lines changed: 32 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -3477,14 +3477,13 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
34773477
}
34783478
}
34793479

3480-
#ifdef GGML_USE_CPU_AARCH64
3481-
static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * data, size_t data_size) {
3480+
static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
34823481
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
34833482
GGML_ASSERT(t->ne[0] % 8 == 0);
34843483
GGML_ASSERT(interleave_block == 4 || interleave_block == 8);
34853484

3486-
block_q4_0x4 *dst = (block_q4_0x4 *)t->data;
3487-
const block_q4_0 *src = (const block_q4_0 *)data;
3485+
block_q4_0x4 * dst = (block_q4_0x4 *)t->data;
3486+
const block_q4_0 * src = (const block_q4_0 *)data;
34883487
block_q4_0 dst_tmp[4];
34893488
int nrow = t->ne[1]; // Number of rows
34903489
int nrows_interleaved = 4;
@@ -3493,8 +3492,7 @@ static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_bloc
34933492
GGML_ASSERT(data_size == nrow * nblocks * sizeof(block_q4_0));
34943493

34953494
for (int b = 0; b < nrow; b += nrows_interleaved) {
3496-
for (int64_t x = 0; x < nblocks; x++)
3497-
{
3495+
for (int64_t x = 0; x < nblocks; x++) {
34983496
for (int i = 0; i < nrows_interleaved; i++) {
34993497
dst_tmp[i] = src[x + i * nblocks];
35003498
}
@@ -3506,13 +3504,13 @@ static void repack_q4_0_to_q4_0_4_bl(struct ggml_tensor * t, int interleave_bloc
35063504
GGML_UNUSED(data_size);
35073505
}
35083506

3509-
static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block, const void * data, size_t data_size) {
3507+
static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor * t, int interleave_block, const void * restrict data, size_t data_size) {
35103508
GGML_ASSERT(t->type == GGML_TYPE_Q4_0);
35113509
GGML_ASSERT(t->ne[0] % 8 == 0);
35123510
GGML_ASSERT(interleave_block == 8);
35133511

3514-
block_q4_0x8 *dst = (block_q4_0x8*)t->data;
3515-
const block_q4_0 *src = (const block_q4_0*) data;
3512+
block_q4_0x8 * dst = (block_q4_0x8*)t->data;
3513+
const block_q4_0 * src = (const block_q4_0*) data;
35163514
block_q4_0 dst_tmp[8];
35173515
int nrow = t->ne[1]; // Number of rows
35183516
int nrows_interleaved = 8;
@@ -3534,46 +3532,42 @@ static void repack_q4_0_to_q4_0_8_bl(struct ggml_tensor *t, int interleave_block
35343532
}
35353533

35363534
// Prepare for optimized kernels if applicable
3537-
int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size) {
3538-
GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
3539-
int ret = -1;
3540-
#if defined(__ARM_ARCH)
3541-
if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
3542-
repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
3543-
ret = 0;
3544-
}
3545-
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
3546-
repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
3547-
ret = 0;
3548-
}
3549-
else if (ggml_cpu_has_neon()) {
3550-
repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
3551-
ret = 0;
3535+
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * restrict data, size_t data_size) {
3536+
if (cur->type == repack_type) {
3537+
memcpy(cur->data, data, data_size);
3538+
return;
35523539
}
3553-
#endif
3554-
return ret;
35553540

3556-
GGML_UNUSED(cur);
3557-
GGML_UNUSED(data);
3558-
GGML_UNUSED(data_size);
3541+
GGML_ASSERT(cur->type == GGML_TYPE_Q4_0);
3542+
3543+
switch (repack_type) {
3544+
case GGML_TYPE_Q4_0_8_8:
3545+
repack_q4_0_to_q4_0_8_bl(cur, 8, data, data_size);
3546+
break;
3547+
case GGML_TYPE_Q4_0_4_8:
3548+
repack_q4_0_to_q4_0_4_bl(cur, 8, data, data_size);
3549+
break;
3550+
case GGML_TYPE_Q4_0_4_4:
3551+
repack_q4_0_to_q4_0_4_bl(cur, 4, data, data_size);
3552+
break;
3553+
default:
3554+
GGML_ABORT("Unsupported type");
3555+
}
35593556
}
35603557

3561-
enum ggml_type ggml_get_optimal_type(const struct ggml_tensor * cur) {
3562-
#if defined(__ARM_ARCH)
3558+
enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur) {
35633559
if (cur->type == GGML_TYPE_Q4_0) {
3564-
if (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0) {
3560+
// TODO: enable for AVX2 - currently disabled due to bad gemv performance
3561+
if (/* ggml_cpu_has_avx2() || */ (ggml_cpu_has_sve() && ggml_cpu_has_matmul_int8() && ggml_cpu_get_sve_cnt() == QK8_0)) {
35653562
return GGML_TYPE_Q4_0_8_8;
35663563
}
3567-
else if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
3564+
if (ggml_cpu_has_neon() && ggml_cpu_has_matmul_int8()) {
35683565
return GGML_TYPE_Q4_0_4_8;
35693566
}
3570-
else if (ggml_cpu_has_neon()) {
3567+
if (ggml_cpu_has_neon()) {
35713568
return GGML_TYPE_Q4_0_4_4;
35723569
}
35733570
}
3574-
#endif
3575-
return cur->type;
35763571

3577-
GGML_UNUSED(cur);
3572+
return cur->type;
35783573
}
3579-
#endif

ggml/src/ggml-aarch64.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,8 @@ void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const vo
3333
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
3434
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
3535

36-
#ifdef GGML_USE_CPU_AARCH64
37-
int ggml_prepare_optimal_kernel(struct ggml_tensor * cur, const void * data, size_t data_size);
38-
enum ggml_type ggml_get_optimal_type(const struct ggml_tensor * cur);
39-
#endif
36+
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
37+
enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
4038

4139
#ifdef __cplusplus
4240
}

ggml/src/ggml-backend.cpp

Lines changed: 59 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2239,36 +2239,44 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
22392239
}
22402240
#endif
22412241

2242-
#ifdef GGML_USE_CPU_AARCH64
2243-
22442242
// buffer type AARCH64
22452243

2244+
#ifdef __GNUC__
2245+
#pragma GCC diagnostic push
2246+
#pragma GCC diagnostic ignored "-Wpedantic"
2247+
#endif
2248+
22462249
#include "ggml-aarch64.h"
22472250

2251+
#ifdef __GNUC__
2252+
#pragma GCC diagnostic pop
2253+
#endif
2254+
2255+
static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
2256+
tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT
2257+
2258+
GGML_UNUSED(buffer);
2259+
}
2260+
22482261
static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
2249-
bool quantize = tensor->type == GGML_TYPE_Q4_0 &&
2250-
tensor->op == GGML_OP_NONE &&
2251-
strcmp(tensor->name, "token_embd.weight") != 0;
2262+
GGML_ASSERT(offset == 0);
2263+
GGML_ASSERT(size == ggml_nbytes(tensor));
22522264

2253-
if (quantize) {
2254-
GGML_ASSERT(offset == 0);
2255-
if (ggml_prepare_optimal_kernel(tensor, data, size) == 0) {
2256-
return;
2257-
}
2258-
}
2259-
memcpy((char *)tensor->data + offset, data, size);
2265+
enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;
2266+
2267+
ggml_aarch64_repack_tensor(tensor, repack_type, data, size);
22602268

22612269
GGML_UNUSED(buffer);
22622270
}
22632271

22642272
static const struct ggml_backend_buffer_i ggml_backend_cpu_aarch64_buffer_i = {
22652273
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
22662274
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
2267-
/* .init_tensor = */ NULL, // no initialization required
2275+
/* .init_tensor = */ ggml_backend_cpu_aarch64_buffer_init_tensor,
22682276
/* .memset_tensor = */ ggml_backend_cpu_buffer_memset_tensor,
22692277
/* .set_tensor = */ ggml_backend_cpu_aarch64_buffer_set_tensor,
2270-
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
2271-
/* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
2278+
/* .get_tensor = */ NULL,
2279+
/* .cpy_tensor = */ NULL,
22722280
/* .clear = */ ggml_backend_cpu_buffer_clear,
22732281
/* .reset = */ NULL,
22742282
};
@@ -2298,33 +2306,37 @@ ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
22982306
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
22992307
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
23002308
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
2301-
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
2309+
/* .is_host = */ NULL,
23022310
},
23032311
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
2304-
/* .context = */ NULL,
2312+
/* .context = */ NULL,
23052313
};
23062314

23072315
return &ggml_backend_cpu_buffer_type_aarch64;
23082316
}
2309-
#endif
2317+
2318+
bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) {
2319+
return buft == ggml_backend_cpu_aarch64_buffer_type();
2320+
}
23102321

23112322
static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
2312-
static ggml_backend_buffer_type_t bufts[3];
2313-
int index = 0;
2323+
static std::vector<ggml_backend_buffer_type_t> bufts = []() {
2324+
std::vector<ggml_backend_buffer_type_t> bufts;
23142325

23152326
#ifdef GGML_USE_CPU_HBM
2316-
bufts[index++] = ggml_backend_cpu_hbm_buffer_type();
2327+
bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
23172328
#endif
23182329

23192330
#ifdef GGML_USE_CPU_AARCH64
2320-
if (ggml_cpu_has_neon() || ggml_cpu_has_matmul_int8() || ggml_cpu_has_sve()) {
2321-
bufts[index++] = ggml_backend_cpu_aarch64_buffer_type();
2322-
}
2331+
bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
23232332
#endif
23242333

2325-
bufts[index] = NULL; // Terminate the list
2334+
bufts.push_back(NULL);
2335+
2336+
return bufts;
2337+
}();
23262338

2327-
return bufts;
2339+
return bufts.data();
23282340

23292341
GGML_UNUSED(device);
23302342
}
@@ -2635,15 +2647,21 @@ static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_b
26352647
}
26362648

26372649
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
2638-
#ifdef GGML_USE_CPU_AARCH64
2639-
const struct ggml_tensor *tensor = op->src[0];
2640-
if (tensor && tensor->buffer && (strcmp(tensor->buffer->buft->iface.get_name(tensor->buffer->buft),"CPU_AARCH64") == 0)) {
2641-
if (op->op == GGML_OP_MUL_MAT && tensor->type == GGML_TYPE_Q4_0) {
2642-
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits_cpu(tensor->type)->vec_dot_type;
2650+
const struct ggml_tensor * src0 = op->src[0];
2651+
const struct ggml_tensor * src1 = op->src[1];
2652+
2653+
if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
2654+
if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
2655+
return false;
26432656
}
2644-
return false;
26452657
}
2646-
#endif
2658+
2659+
for (int i = 1; i < GGML_MAX_SRC; i++) {
2660+
if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
2661+
return false;
2662+
}
2663+
}
2664+
26472665
switch (op->op) {
26482666
case GGML_OP_CPY:
26492667
return
@@ -2652,13 +2670,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
26522670
op->type != GGML_TYPE_IQ1_S &&
26532671
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
26542672
case GGML_OP_MUL_MAT:
2655-
return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
2673+
return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
26562674
case GGML_OP_ROPE_BACK:
26572675
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
26582676
case GGML_OP_IM2COL_BACK:
2659-
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
2677+
return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
26602678
case GGML_OP_OUT_PROD:
2661-
return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
2679+
return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32;
26622680
default:
26632681
return true;
26642682
}
@@ -2667,7 +2685,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
26672685
}
26682686

26692687
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
2670-
return ggml_backend_buft_is_host(buft);
2688+
return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
26712689

26722690
GGML_UNUSED(dev);
26732691
}
@@ -2721,7 +2739,7 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
27212739
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
27222740
return (void *)ggml_backend_cpu_set_n_threads;
27232741
}
2724-
if (strcmp(name, "ggml_backend_cpu_get_extra_bufts") == 0) {
2742+
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
27252743
return (void *)ggml_backend_cpu_get_extra_bufts;
27262744
}
27272745

@@ -2738,6 +2756,9 @@ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
27382756
};
27392757

27402758
ggml_backend_reg_t ggml_backend_cpu_reg(void) {
2759+
// init CPU feature detection
2760+
ggml_cpu_init();
2761+
27412762
static struct ggml_backend_reg ggml_backend_cpu_reg = {
27422763
/* .iface = */ ggml_backend_cpu_reg_i,
27432764
/* .context = */ NULL,

0 commit comments

Comments
 (0)