Commit 2acd1b6

ngxson authored and danbev committed
img pre processing
1 parent 863a4ca commit 2acd1b6

File tree

8 files changed: +568 -25 lines changed


convert_hf_to_gguf.py

Lines changed: 5 additions & 0 deletions
@@ -1651,8 +1651,13 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
         self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
         # TODO: should not hardcode these, but they are currently missing from config.json
+        self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP)
         self.gguf_writer.add_vision_clip_max_position_embeddings(577)
         self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05)
+        default_image_mean = [0.48145466, 0.4578275, 0.40821073]
+        default_image_std = [0.26862954, 0.26130258, 0.27577711]
+        self.gguf_writer.add_vision_clip_image_mean(default_image_mean)
+        self.gguf_writer.add_vision_clip_image_std(default_image_std)

     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
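Not part of the diff, but for context: these defaults are the standard CLIP normalization constants, and 577 corresponds to 1 class token plus (336/14)^2 patches for a ViT-L/14 encoder at 336x336. A minimal sketch of how a preprocessor might apply the stored mean/std, assuming an RGB image already resized and scaled to [0, 1] (numpy only; the helper name is hypothetical):

import numpy as np

# Same values as written to the GGUF metadata above (standard CLIP constants).
IMAGE_MEAN = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
IMAGE_STD  = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)

def normalize_image(pixels: np.ndarray) -> np.ndarray:
    # pixels: H x W x 3, float32, values in [0, 1]
    return (pixels - IMAGE_MEAN) / IMAGE_STD

# Hypothetical usage with a dummy 336x336 RGB image:
img = np.random.rand(336, 336, 3).astype(np.float32)
print(normalize_image(img).shape)  # (336, 336, 3)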

gguf-py/gguf/constants.py

Lines changed: 5 additions & 0 deletions
@@ -229,6 +229,7 @@ class Clip:
             PROJECTION_DIM = "vision.clip.projection_dim"
             USE_GELU = "vision.clip.use_gelu"
             MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings"
+            PROJECTOR_TYPE = "vision.clip.projector_type"
             HEAD_COUNT = "vision.clip.attention.head_count"
             LAYERNORM_EPS = "vision.clip.attention.layer_norm_epsilon"

@@ -1689,6 +1690,10 @@ class PoolingType(IntEnum):
     CLS = 2


+class CLIPProjectorType(Enum):
+    MLP = 'mlp'
+
+
 class GGMLQuantizationType(IntEnum):
     F32 = 0
     F16 = 1

gguf-py/gguf/gguf_writer.py

Lines changed: 10 additions & 0 deletions
@@ -27,6 +27,7 @@
     PoolingType,
     TokenType,
     ExpertGatingFuncType,
+    CLIPProjectorType,
 )

 from .quants import quant_shape_from_byte_shape

@@ -905,9 +906,18 @@ def add_vision_clip_head_count(self, value: int) -> None:
     def add_vision_clip_max_position_embeddings(self, value: int) -> None:
         self.add_uint32(Keys.Vision.Clip.MAX_POS_EMBEDDING, value)

+    def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None:
+        self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value)
+
     def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
         self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)

+    def add_vision_clip_image_mean(self, value: Sequence[float]) -> None:
+        self.add_array(Keys.Vision.IMAGE_MEAN, value)
+
+    def add_vision_clip_image_std(self, value: Sequence[float]) -> None:
+        self.add_array(Keys.Vision.IMAGE_STD, value)
+
     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
         if not isinstance(value, str):
             template_default = None

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
@@ -215,6 +215,7 @@ enum llm_kv {
     LLM_KV_VISION_CLIP_PROJECTION_TYPE,
     LLM_KV_VISION_CLIP_PROJECTION_DIM,
     LLM_KV_VISION_CLIP_USE_GELU,
+    LLM_KV_VISION_CLIP_PROJECTOR_TYPE,
     LLM_KV_VISION_CLIP_HEAD_COUNT,
     LLM_KV_VISION_CLIP_MAX_POS_EMBD,
     LLM_KV_VISION_CLIP_LAYERNORM_EPS,

src/llama-model-loader.cpp

Lines changed: 1 addition & 0 deletions
@@ -412,6 +412,7 @@ namespace GGUFMeta {
 // TODO: this is not very clever - figure out something better
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+template bool llama_model_loader::get_key_or_arr<std::array<float, 3>>(enum llm_kv kid, std::array<float, 3> & result, uint32_t n, bool required);

 llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
     int trace = 0;

src/llama-model.cpp

Lines changed: 33 additions & 24 deletions
@@ -501,30 +501,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     uint32_t n_vocab = 0;
     ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

-    std::string vision_type;
-    ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
-    if (vision_type == "clip") {
-        hparams.has_vision = true;
-        ml.get_key(LLM_KV_VISION_IMAGE_SIZE, hparams.clip.image_size, true);
-        ml.get_key(LLM_KV_VISION_PATCH_SIZE, hparams.clip.patch_size, true);
-        ml.get_key(LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, hparams.clip.hidden_size, true);
-        ml.get_key(LLM_KV_VISION_CLIP_BLOCK_COUNT, hparams.clip.n_layer, true);
-        ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, hparams.clip.n_intermediate, true);
-        ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, hparams.clip.n_head, true);
-        ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, hparams.clip.eps, true);
-        // TODO: add image_std
-        std::string arch;
-        ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
-        for (auto & it : VISION_ARCH_NAMES) {
-            if (arch == it.second) {
-                hparams.clip.arch = it.first;
-                break;
-            }
-        }
-    } else if (!vision_type.empty()) {
-        throw std::runtime_error(format("unsupported vision type: %s", vision_type.c_str()));
-    }
-
     using e_model = llm_type; // TMP

     // arch-specific KVs

@@ -1262,6 +1238,39 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         default: throw std::runtime_error("unsupported model architecture");
     }

+    // vision model
+    std::string vision_type;
+    ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
+    if (vision_type == "clip") {
+        hparams.has_vision = true;
+        std::string proj_type;
+        ml.get_key(LLM_KV_VISION_IMAGE_SIZE, hparams.clip.image_size, true);
+        ml.get_key(LLM_KV_VISION_PATCH_SIZE, hparams.clip.patch_size, true);
+        ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, hparams.clip.image_mean, 3, true);
+        ml.get_key_or_arr(LLM_KV_VISION_IMAGE_STD, hparams.clip.image_std, 3, true);
+        ml.get_key(LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, hparams.clip.hidden_size, true);
+        ml.get_key(LLM_KV_VISION_CLIP_BLOCK_COUNT, hparams.clip.n_layer, true);
+        ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, hparams.clip.n_intermediate, true);
+        ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, hparams.clip.n_head, true);
+        ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, hparams.clip.eps, true);
+        ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, proj_type, true);
+        if (proj_type == "mlp") {
+            hparams.clip.proj_type = CLIP_PROJECTOR_TYPE_MLP;
+        } else {
+            throw std::runtime_error(format("unsupported clip projector type: %s", proj_type.c_str()));
+        }
+        std::string arch;
+        ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
+        for (auto & it : VISION_ARCH_NAMES) {
+            if (arch == it.second) {
+                hparams.clip.arch = it.first;
+                break;
+            }
+        }
+    } else if (!vision_type.empty()) {
+        throw std::runtime_error(format("unsupported vision type: %s", vision_type.c_str()));
+    }
+
     // arch-specific CLIP hparams
     switch (hparams.clip.arch) {
         case VISION_ARCH_LLAVA:
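For illustration only (also not part of the commit): since the converter and the C++ loader must agree on these metadata keys, one quick sanity check is to read a converted file back with gguf-py's GGUFReader and confirm that the keys the updated load_hparams() requires are present. The file path below is hypothetical and assumes a model converted on this branch.

from gguf import GGUFReader
from gguf.constants import Keys

reader = GGUFReader("llava-test.gguf")  # hypothetical converted model

# Keys the updated load_hparams() reads for CLIP vision models.
for key in (Keys.Vision.Clip.PROJECTOR_TYPE, Keys.Vision.IMAGE_MEAN, Keys.Vision.IMAGE_STD):
    field = reader.get_field(key)
    print(key, "present" if field is not None else "MISSING")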
