Commit 7a74dee

llama : temporary disable Q6_K output quantization (#1711)
1 parent 590250f · commit 7a74dee

File tree

1 file changed: +9 −4 lines changed


llama.cpp

Lines changed: 9 additions & 4 deletions
@@ -2198,27 +2198,32 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
-            if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
-            else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            // TODO: temporary disabled until Metal / OpenCL support is available
+            // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+            //if (tensor.name == "output.weight") {
+            //    new_type = GGML_TYPE_Q6_K;
+            //}
+            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                     (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
                     (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
             }
-            else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                     (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
                     (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
             }
-            else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
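
For readers tracing the heuristic in this hunk: the `(i < n/8 || i >= 7*n/8 || (i - n/8) % 3 == 2)` test decides which attention.wv / feed_forward.w2 tensors get promoted to GGML_TYPE_Q6_K when quantizing to Q4_K_M or Q5_K_M. The following self-contained C++ sketch (not part of the commit; n_layers = 32 is an illustrative assumption) simply prints which layer indices that rule selects.

// Sketch: enumerate the layers the Q4_K_M / Q5_K_M heuristic promotes to Q6_K.
// Illustrative only; n_layers = 32 is assumed here, not taken from the commit.
#include <cstdio>

int main() {
    const int n_layers = 32;
    for (int i = 0; i < n_layers; ++i) {
        const bool use_q6_k =
            i <  n_layers/8          ||   // first eighth of the layers
            i >= 7*n_layers/8        ||   // last eighth of the layers
            (i - n_layers/8) % 3 == 2;    // every third layer in between
        printf("layer %2d -> %s\n", i, use_q6_k ? "Q6_K" : "default K-quant");
    }
    return 0;
}

For n_layers = 32 this picks layers 0–3 and 28–31 plus every third layer in the middle (6, 9, 12, ..., 27), i.e. 16 of the 32 wv/w2 tensors end up at the higher-precision Q6_K; the remaining per-ftype overrides in the hunk stay as they were.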
