From 06e7340eceb7f96363453adea68850a6dbcde6b6 Mon Sep 17 00:00:00 2001
From: Wagner Bruna
Date: Thu, 14 Aug 2025 11:12:53 -0300
Subject: [PATCH 1/2] refactor: remove clip_skip persistent attribute

All handlers are constructed with the default clip_skip value, and it
is always overridden at inference time, so there is little point in
keeping it as a persistent attribute. Instead, just propagate the
parameter value down from get_learned_condition*.
---
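Reviewer note (not part of the commit message): a minimal standalone
sketch of the shape of this refactor, with hypothetical simplified types
(the real code paths are CLIPTextModel, CLIPTextModelRunner and the
Conditioner structs). The point is replacing mutate-then-run state with a
value threaded through the call:

  #include <cstdio>

  // Before: clip_skip was persistent, mutable state on the model,
  // set via set_clip_skip() before every run.
  struct ModelBefore {
      int clip_skip = -1;
      void set_clip_skip(int skip) { clip_skip = (skip <= 0) ? -1 : skip; }
      int forward() const { return clip_skip; }  // reads the stored value
  };

  // After: clip_skip is a defaulted parameter, passed down per call.
  struct ModelAfter {
      int forward(int clip_skip = -1) const { return clip_skip; }
  };

  int main() {
      ModelBefore before;
      before.set_clip_skip(2);                // two steps: mutate, then run
      std::printf("%d\n", before.forward());

      ModelAfter after;
      std::printf("%d\n", after.forward(2));  // one step: value rides the call
      return 0;
  }
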
 clip.hpp             | 37 +++++++++----------------
 conditioner.hpp      | 65 ++++++++++++++------------------------
 stable-diffusion.cpp |  1 -
 3 files changed, 34 insertions(+), 69 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index 7ca565d9..121583a2 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -674,12 +674,10 @@ class CLIPTextModel : public GGMLBlock {
     int32_t n_head = 12;
     int32_t n_layer = 12;           // num_hidden_layers
     int32_t projection_dim = 1280;  // only for OPEN_CLIP_VIT_BIGG_14
-    int32_t clip_skip = -1;
     bool with_final_ln = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  bool with_final_ln = true,
-                  int clip_skip_value = -1)
+                  bool with_final_ln = true)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -692,20 +690,12 @@ class CLIPTextModel : public GGMLBlock {
             n_head = 20;
             n_layer = 32;
         }
-        set_clip_skip(clip_skip_value);
 
         blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    void set_clip_skip(int skip) {
-        if (skip <= 0) {
-            skip = -1;
-        }
-        clip_skip = skip;
-    }
-
     struct ggml_tensor* get_token_embed_weight() {
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         return embeddings->get_token_embed_weight();
     }
@@ -715,7 +705,8 @@
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* tkn_embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         // input_ids: [N, n_token]
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@@ -871,9 +862,8 @@ struct CLIPTextModelRunner : public GGMLRunner {
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        bool with_final_ln = true,
-                        int clip_skip_value = -1)
-        : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
+                        bool with_final_ln = true)
+        : GGMLRunner(backend), model(version, with_final_ln) {
         model.init(params_ctx, tensor_types, prefix);
     }
@@ -881,10 +871,6 @@
         return "clip";
     }
 
-    void set_clip_skip(int clip_skip) {
-        model.set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         model.get_param_tensors(tensors, prefix);
     }
@@ -893,7 +879,8 @@
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         size_t N = input_ids->ne[1];
         size_t n_token = input_ids->ne[0];
         if (input_ids->ne[0] > model.n_token) {
             input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
         }
-        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
     }
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
                                     int num_custom_embeddings = 0,
                                     void* custom_embeddings_data = NULL,
                                     size_t max_token_idx = 0,
-                                    bool return_pooled = false) {
+                                    bool return_pooled = false,
+                                    int clip_skip = -1) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
         input_ids = to_backend(input_ids);
@@ -927,7 +915,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }
 
-        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
 
         ggml_build_forward_expand(gf, hidden_states);
 
@@ -940,10 +928,11 @@
                  void* custom_embeddings_data,
                  size_t max_token_idx,
                  bool return_pooled,
+                 int clip_skip,
                  ggml_tensor** output,
                  ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
+            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
         };
         GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
     }
diff --git a/conditioner.hpp b/conditioner.hpp
index 6a51dce8..8f3032e4 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -60,8 +60,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       const String2GGMLType& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
-                                      PMVersion pv = PM_VERSION_1,
-                                      int clip_skip = -1)
+                                      PMVersion pv = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
         if (sd_version_is_sd1(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
@@ -71,20 +70,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
             text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         }
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 1;
-            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
-                clip_skip = 2;
-            }
-        }
-        text_model->set_clip_skip(clip_skip);
-        if (sd_version_is_sdxl(version)) {
-            text_model2->set_clip_skip(clip_skip);
-        }
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -411,7 +396,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                          int height,
                                          int adm_in_channels = -1,
                                          bool force_zero_embeddings = false) {
-        set_clip_skip(clip_skip);
+        if (clip_skip <= 0) {
+            clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
+        }
+
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;        // [N, n_token, hidden_size]
         struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
@@ -454,6 +442,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                  token_embed_custom.data(),
                                  max_token_idx,
                                  false,
+                                 clip_skip,
                                  &chunk_hidden_states1,
                                  work_ctx);
             if (sd_version_is_sdxl(version)) {
@@ -463,6 +452,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       token_embed_custom.data(),
                                       max_token_idx,
                                       false,
+                                      clip_skip,
                                       &chunk_hidden_states2, work_ctx);
                 // concat
                 chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
@@ -474,6 +464,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       token_embed_custom.data(),
                                       max_token_idx,
                                       true,
+                                      clip_skip,
                                       &pooled,
                                       work_ctx);
             }
@@ -663,21 +654,11 @@ struct SD3CLIPEmbedder : public Conditioner {
     std::shared_ptr<T5Runner> t5;
 
     SD3CLIPEmbedder(ggml_backend_t backend,
-                    const String2GGMLType& tensor_types = {},
-                    int clip_skip = -1)
+                    const String2GGMLType& tensor_types = {})
         : clip_g_tokenizer(0) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
         clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
-        clip_g->set_clip_skip(clip_skip);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -774,7 +755,9 @@ struct SD3CLIPEmbedder : public Conditioner {
                                          std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                                          int clip_skip,
                                          bool force_zero_embeddings = false) {
-        set_clip_skip(clip_skip);
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
         auto& clip_l_tokens = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& clip_g_tokens = token_and_weights[1].first;
@@ -812,6 +795,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                  NULL,
                                  max_token_idx,
                                  false,
+                                 clip_skip,
                                  &chunk_hidden_states_l,
                                  work_ctx);
             {
@@ -839,6 +823,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                  NULL,
                                  max_token_idx,
                                  true,
+                                 clip_skip,
                                  &pooled_l,
                                  work_ctx);
             }
@@ -860,6 +845,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                  NULL,
                                  max_token_idx,
                                  false,
+                                 clip_skip,
                                  &chunk_hidden_states_g,
                                  work_ctx);
 
@@ -888,6 +874,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                  NULL,
                                  max_token_idx,
                                  true,
+                                 clip_skip,
                                  &pooled_g,
                                  work_ctx);
             }
@@ -1010,18 +997,9 @@ struct FluxCLIPEmbedder : public Conditioner {
     size_t chunk_len = 256;
 
     FluxCLIPEmbedder(ggml_backend_t backend,
-                     const String2GGMLType& tensor_types = {},
-                     int clip_skip = -1) {
+                     const String2GGMLType& tensor_types = {}) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -1102,7 +1080,9 @@ struct FluxCLIPEmbedder : public Conditioner {
                                          std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                                          int clip_skip,
                                          bool force_zero_embeddings = false) {
-        set_clip_skip(clip_skip);
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
         auto& clip_l_tokens = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& t5_tokens = token_and_weights[1].first;
@@ -1136,6 +1116,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                                  NULL,
                                  max_token_idx,
                                  true,
+                                 clip_skip,
                                  &pooled,
                                  work_ctx);
             }
@@ -1232,16 +1213,12 @@ struct PixArtCLIPEmbedder : public Conditioner {
 
     PixArtCLIPEmbedder(ggml_backend_t backend,
                        const String2GGMLType& tensor_types = {},
-                       int clip_skip = -1,
                        bool use_mask = false,
                        int mask_pad = 1)
         : use_mask(use_mask), mask_pad(mask_pad) {
         t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
     }
 
-    void set_clip_skip(int clip_skip) {
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
     }
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index c5448f92..7a896774 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -346,7 +346,6 @@ class StableDiffusionGGML {
             if (is_chroma) {
                 cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend,
                                                                         model_loader.tensor_storages_types,
-                                                                        -1,
                                                                         sd_ctx_params->chroma_use_t5_mask,
                                                                         sd_ctx_params->chroma_t5_mask_pad);
             } else {

From 42f2fa662c91beea9fd36a00aecedf58625d4e30 Mon Sep 17 00:00:00 2001
From: Wagner Bruna
Date: Thu, 14 Aug 2025 11:50:00 -0300
Subject: [PATCH 2/2] feat: reduce CLIP memory usage with no embeddings

The CLIP weights need to be converted to f32 to support textual
inversions (fbd42b6fc16d14fbd362993fa1d083740a05f113), but that
conversion increases the amount of allocated VRAM even when no
embeddings are being used. Only force the f32 conversion when an
embeddings directory is actually configured.
---
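Reviewer note (not part of the commit message): a rough standalone sketch
of the type-selection idea, with simplified, hypothetical names (the real
logic lives in CLIPEmbeddings::init_params and keys off tensor_types plus
the tensor name prefix):

  #include <cstdio>
  #include <map>
  #include <string>

  // Stand-in for ggml's tensor type enum.
  enum wtype { WTYPE_F32, WTYPE_F16, WTYPE_Q8_0 };

  wtype pick_token_embed_type(const std::map<std::string, wtype>& tensor_types,
                              const std::string& prefix,
                              bool force_clip_f32) {
      // Textual inversion concatenates custom embedding rows onto the
      // token embedding table, so the table must be f32 in that case.
      if (force_clip_f32) {
          return WTYPE_F32;
      }
      // Otherwise keep whatever type the checkpoint already stores,
      // avoiding an f32 copy of the whole table in VRAM.
      auto it = tensor_types.find(prefix + "token_embedding.weight");
      if (it != tensor_types.end()) {
          return it->second;
      }
      return WTYPE_F32;  // fallback when the tensor isn't listed
  }

  int main() {
      std::map<std::string, wtype> types = {{"te.token_embedding.weight", WTYPE_F16}};
      // No embeddings dir: keep the checkpoint's f16 table.
      std::printf("%d\n", pick_token_embed_type(types, "te.", false));
      // Embeddings dir set: force f32.
      std::printf("%d\n", pick_token_embed_type(types, "te.", true));
      return 0;
  }
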
 clip.hpp        | 22 ++++++++++++++++------
 conditioner.hpp |  9 +++++----
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index 121583a2..beecd88f 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -544,9 +544,15 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t embed_dim;
     int64_t vocab_size;
     int64_t num_positions;
+    bool force_clip_f32;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+            if (tensor_type != tensor_types.end())
+                token_wtype = tensor_type->second;
+        }
         enum ggml_type position_wtype = GGML_TYPE_F32;
 
         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@@ -556,10 +562,12 @@ class CLIPEmbeddings : public GGMLBlock {
 public:
     CLIPEmbeddings(int64_t embed_dim,
                    int64_t vocab_size = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32 = false)
         : embed_dim(embed_dim),
           vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
     }
 
     struct ggml_tensor* get_token_embed_weight() {
@@ -677,7 +685,8 @@ class CLIPTextModel : public GGMLBlock {
     bool with_final_ln = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  bool with_final_ln = true)
+                  bool with_final_ln = true,
+                  bool force_clip_f32 = false)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -691,7 +700,7 @@ class CLIPTextModel : public GGMLBlock {
             n_layer = 32;
         }
 
-        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
@@ -862,8 +871,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        bool with_final_ln = true)
-        : GGMLRunner(backend), model(version, with_final_ln) {
+                        bool with_final_ln = true,
+                        bool force_clip_f32 = false)
+        : GGMLRunner(backend), model(version, with_final_ln, force_clip_f32) {
         model.init(params_ctx, tensor_types, prefix);
     }
diff --git a/conditioner.hpp b/conditioner.hpp
index 8f3032e4..7c47670d 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -62,13 +62,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        bool force_clip_f32 = embd_dir.size() > 0;
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
         }
     }
 