
Commit 90f9934 (parent: cb48c3c)

refactor(tx): compatible compvis format

Signed-off-by: thxCode <thxcode0824@gmail.com>

6 files changed: +117 -43 lines
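
The commit makes the text-encoder and VAE loaders accept checkpoints in the original CompVis (single-file ldm) layout, where the encoders live under cond_stage_model.* and the VAE under first_stage_model.*, in addition to the split layout that uses text_encoders.clip_l.*, text_encoders.clip_g.*, text_encoders.t5xxl.*, and vae.*. Each component gets a cc_* ("compatible CompVis") flag that selects the tensor-name prefix. A minimal standalone sketch of the resulting mapping; the prefix strings are copied verbatim from the diffs below, while the main driver is only illustrative:

    #include <cstdio>
    #include <string>

    // Prefix selection introduced by this commit; strings are verbatim from
    // the diffs below. Note the encoder index differs per pipeline: in SD3
    // the T5 encoder is "cond_stage_model.2", while in Flux (which has no
    // CLIP-G) it is "cond_stage_model.1".
    std::string clip_l_prefix(bool cc) {
        return cc ? "cond_stage_model.transformer.text_model"
                  : "text_encoders.clip_l.transformer.text_model";
    }
    std::string clip_g_prefix(bool cc) {
        return cc ? "cond_stage_model.1.transformer.text_model"
                  : "text_encoders.clip_g.transformer.text_model";
    }
    std::string sd3_t5xxl_prefix(bool cc) {
        return cc ? "cond_stage_model.2.transformer"
                  : "text_encoders.t5xxl.transformer";
    }
    std::string vae_prefix(bool cc) {
        return cc ? "first_stage_model" : "";
    }

    int main() {
        const bool flags[] = {true, false};
        for (bool cc : flags) {
            std::printf("cc=%d\n  clip_l: %s\n  clip_g: %s\n  t5xxl:  %s\n  vae:    \"%s\"\n",
                        (int)cc,
                        clip_l_prefix(cc).c_str(),
                        clip_g_prefix(cc).c_str(),
                        sd3_t5xxl_prefix(cc).c_str(),
                        vae_prefix(cc).c_str());
        }
        return 0;
    }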

conditioner.hpp

Lines changed: 66 additions & 19 deletions
@@ -43,6 +43,7 @@ struct Conditioner {
 // ldm.modules.encoders.modules.FrozenCLIPEmbedder
 // Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
 struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
+    bool cc_clip_l, cc_clip_g;
     SDVersion version = VERSION_SD1;
     PMVersion pm_version = PM_VERSION_1;
     CLIPTokenizer tokenizer;
@@ -60,26 +61,38 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv = PM_VERSION_1,
-                                      int clip_skip = -1)
+                                      int clip_skip = -1,
+                                      bool cc_clip_l = false,
+                                      bool cc_clip_g = false)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
         if (clip_skip <= 0) {
             clip_skip = 1;
             if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
                 clip_skip = 2;
             }
         }
+        this->cc_clip_l = cc_clip_l;
+        this->cc_clip_g = cc_clip_g;
         if (sd_version_is_sd1(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
         } else if (sd_version_is_sd2(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
         } else if (sd_version_is_sdxl(version)) {
             if (version != VERSION_SDXL_REFINER) {
-                text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
+                text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
             }
             text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         }
     }

+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string clip_g_prefix() {
+        return cc_clip_g ? "cond_stage_model.1.transformer.text_model" : "text_encoders.clip_g.transformer.text_model";
+    }
+
     void set_clip_skip(int clip_skip) {
         if (text_model) {
             text_model->set_clip_skip(clip_skip);
@@ -91,10 +104,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         if (text_model) {
-            text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
+            text_model->get_param_tensors(tensors, clip_l_prefix());
         }
         if (text_model2) {
-            text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
+            text_model2->get_param_tensors(tensors, clip_g_prefix());
         }
     }

@@ -600,19 +613,21 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 };

 struct FrozenCLIPVisionEmbedder : public GGMLRunner {
+    bool cc_clip_l;
     CLIPVisionModelProjection vision_model;

-    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types)
+    FrozenCLIPVisionEmbedder(ggml_backend_t backend, std::map<std::string, enum ggml_type>& tensor_types, bool cc_clip_l = false)
         : vision_model(OPEN_CLIP_VIT_H_14, true), GGMLRunner(backend) {
-        vision_model.init(params_ctx, tensor_types, "cond_stage_model.transformer");
+        this->cc_clip_l = cc_clip_l;
+        vision_model.init(params_ctx, tensor_types, cc_clip_l ? "cond_stage_model.transformer" : "text_encoders.clip_l.transformer");
     }

     std::string get_desc() {
         return "clip_vision";
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        vision_model.get_param_tensors(tensors, "cond_stage_model.transformer");
+        vision_model.get_param_tensors(tensors, cc_clip_l ? "cond_stage_model.transformer" : "text_encoders.clip_l.transformer");
     }

     struct ggml_cgraph* build_graph(struct ggml_tensor* pixel_values) {
@@ -639,6 +654,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLRunner {
 };

 struct SD3CLIPEmbedder : public Conditioner {
+    bool cc_clip_l, cc_clip_g, cc_t5xxl;
     CLIPTokenizer clip_l_tokenizer;
     CLIPTokenizer clip_g_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
@@ -648,14 +664,32 @@ struct SD3CLIPEmbedder : public Conditioner {

     SD3CLIPEmbedder(ggml_backend_t backend,
                     std::map<std::string, enum ggml_type>& tensor_types,
-                    int clip_skip = -1)
+                    int clip_skip = -1,
+                    bool cc_clip_l = false,
+                    bool cc_clip_g = false,
+                    bool cc_t5xxl = false)
         : clip_g_tokenizer(0) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
-        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        this->cc_clip_l = cc_clip_l;
+        this->cc_clip_g = cc_clip_g;
+        this->cc_t5xxl = cc_t5xxl;
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_g_prefix(), OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+        t5 = std::make_shared<T5Runner>(backend, tensor_types, t5xxl_prefix());
+    }
+
+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string clip_g_prefix() {
+        return cc_clip_g ? "cond_stage_model.1.transformer.text_model" : "text_encoders.clip_g.transformer.text_model";
+    }
+
+    std::string t5xxl_prefix() {
+        return cc_t5xxl ? "cond_stage_model.2.transformer" : "text_encoders.t5xxl.transformer";
     }

     void set_clip_skip(int clip_skip) {
@@ -664,9 +698,9 @@ struct SD3CLIPEmbedder : public Conditioner {
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
-        clip_g->get_param_tensors(tensors, "text_encoders.clip_g.transformer.text_model");
-        t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+        clip_l->get_param_tensors(tensors, clip_l_prefix());
+        clip_g->get_param_tensors(tensors, clip_g_prefix());
+        t5->get_param_tensors(tensors, t5xxl_prefix());
     }

     void alloc_params_buffer() {
@@ -985,28 +1019,41 @@ struct SD3CLIPEmbedder : public Conditioner {
 };

 struct FluxCLIPEmbedder : public Conditioner {
+    bool cc_clip_l, cc_t5xxl;
     CLIPTokenizer clip_l_tokenizer;
     T5UniGramTokenizer t5_tokenizer;
     std::shared_ptr<CLIPTextModelRunner> clip_l;
     std::shared_ptr<T5Runner> t5;

     FluxCLIPEmbedder(ggml_backend_t backend,
                      std::map<std::string, enum ggml_type>& tensor_types,
-                     int clip_skip = -1) {
+                     int clip_skip = -1,
+                     bool cc_clip_l = false,
+                     bool cc_t5xxl = false) {
         if (clip_skip <= 0) {
             clip_skip = 2;
         }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true);
-        t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        this->cc_clip_l = cc_clip_l;
+        this->cc_t5xxl = cc_t5xxl;
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, clip_l_prefix(), OPENAI_CLIP_VIT_L_14, clip_skip, true);
+        t5 = std::make_shared<T5Runner>(backend, tensor_types, t5xxl_prefix());
+    }
+
+    std::string clip_l_prefix() {
+        return cc_clip_l ? "cond_stage_model.transformer.text_model" : "text_encoders.clip_l.transformer.text_model";
+    }
+
+    std::string t5xxl_prefix() {
+        return cc_t5xxl ? "cond_stage_model.1.transformer" : "text_encoders.t5xxl.transformer";
     }

     void set_clip_skip(int clip_skip) {
         clip_l->set_clip_skip(clip_skip);
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
-        t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
+        clip_l->get_param_tensors(tensors, clip_l_prefix());
+        t5->get_param_tensors(tensors, t5xxl_prefix());
     }

     void alloc_params_buffer() {
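
The pattern above is the same in every embedder: the constructor stores the cc_* flags, and both model initialization and tensor registration go through a small *_prefix() helper instead of a hard-coded string. A stripped-down sketch of that shape, with std::map<std::string, int> standing in for the real ggml_tensor* map and an invented ".token_embedding.weight" suffix for illustration:

    #include <cstdio>
    #include <map>
    #include <string>

    // Minimal shape of the refactor: the embedder derives its own tensor-name
    // prefix, so callers of get_param_tensors() no longer pass one in.
    struct MiniClipEmbedder {
        bool cc_clip_l = false;

        std::string clip_l_prefix() const {
            return cc_clip_l ? "cond_stage_model.transformer.text_model"
                             : "text_encoders.clip_l.transformer.text_model";
        }

        // int stands in for struct ggml_tensor* from the real code.
        void get_param_tensors(std::map<std::string, int>& tensors) const {
            // ".token_embedding.weight" is an illustrative suffix, not from the diff.
            tensors[clip_l_prefix() + ".token_embedding.weight"] = 0;
        }
    };

    int main() {
        std::map<std::string, int> tensors;
        MiniClipEmbedder compvis;
        compvis.cc_clip_l = true;
        MiniClipEmbedder split;
        compvis.get_param_tensors(tensors);
        split.get_param_tensors(tensors);
        for (const auto& kv : tensors) {
            std::printf("%s\n", kv.first.c_str());
        }
        return 0;
    }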

model.cpp

Lines changed: 15 additions & 6 deletions
@@ -1469,11 +1469,11 @@ SDVersion ModelLoader::get_sd_version() {
     TensorStorage token_embedding_weight, input_block_weight;
     bool input_block_checked = false;

-    bool has_multiple_encoders = false;
-    bool is_unet = false;
+    bool has_multiple_encoders = false;
+    bool is_unet = false;

-    bool is_xl = false;
-    bool is_flux = false;
+    bool is_xl = false;
+    bool is_flux = false;
     bool is_refiner = false;

 #define found_family (is_xl || is_flux)
@@ -1490,7 +1490,7 @@ SDVersion ModelLoader::get_sd_version() {
         }
         if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos) {
             is_unet = true;
-            if(has_multiple_encoders){
+            if (has_multiple_encoders) {
                 is_xl = true;
                 if (input_block_checked) {
                     break;
@@ -1499,7 +1499,7 @@ SDVersion ModelLoader::get_sd_version() {
         }
         if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos) {
             has_multiple_encoders = true;
-            if(is_unet){
+            if (is_unet) {
                 is_xl = true;
                 if (input_block_checked) {
                     break;
@@ -2037,6 +2037,15 @@ int64_t ModelLoader::get_params_mem_size(ggml_backend_t backend, ggml_type type)
     return mem_size;
 }

+bool ModelLoader::has_prefix_tensors(const std::string& prefix) {
+    for (auto& tensor_storage : tensor_storages) {
+        if (tensor_storage.name.find(prefix) != std::string::npos) {
+            return true;
+        }
+    }
+    return false;
+}
+
 bool convert(const char* input_path, const char* vae_path, const char* output_path, sd_type_t output_type) {
     ModelLoader model_loader;
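
Note that has_prefix_tensors() matches with std::string::find, so it reports a hit when the argument occurs anywhere in a stored tensor name, not only as a leading prefix. A self-contained sketch of the same matching rule over plain strings (the tensor names are invented for illustration):

    #include <cstdio>
    #include <string>
    #include <vector>

    // Same matching rule as ModelLoader::has_prefix_tensors in this commit:
    // substring search, not an anchored prefix check.
    static bool has_prefix_tensors(const std::vector<std::string>& names, const std::string& prefix) {
        for (const auto& name : names) {
            if (name.find(prefix) != std::string::npos) {
                return true;
            }
        }
        return false;
    }

    int main() {
        // Hypothetical tensor names, one per checkpoint layout.
        std::vector<std::string> compvis = {"cond_stage_model.transformer.text_model.final_layer_norm.weight"};
        std::vector<std::string> split   = {"text_encoders.clip_l.transformer.text_model.final_layer_norm.weight"};

        std::printf("compvis has cond_stage_model.: %d\n", (int)has_prefix_tensors(compvis, "cond_stage_model."));
        std::printf("split has cond_stage_model.:   %d\n", (int)has_prefix_tensors(split, "cond_stage_model."));
        return 0;
    }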
20422051

model.h

Lines changed: 1 addition & 0 deletions
@@ -228,6 +228,7 @@ class ModelLoader {

     static std::string load_merges();
     static std::string load_t5_tokenizer_json();
+    bool has_prefix_tensors(const std::string& prefix);
 };

 #endif  // __MODEL_H__

stable-diffusion.cpp

Lines changed: 15 additions & 10 deletions
@@ -431,19 +431,24 @@ class StableDiffusionGGML {
             // TODO: shift_factor
         }

+        auto cc_clip_l = model_loader.has_prefix_tensors("cond_stage_model.") && !model_loader.has_prefix_tensors("text_encoders.clip_l.");
+        auto cc_clip_g = model_loader.has_prefix_tensors("cond_stage_model.") && !model_loader.has_prefix_tensors("text_encoders.clip_g.");
+        auto cc_t5xxl = model_loader.has_prefix_tensors("cond_stage_model.") && !model_loader.has_prefix_tensors("text_encoders.t5xxl.");
+        auto cc_vae = model_loader.has_prefix_tensors("first_stage_model.") && !model_loader.has_prefix_tensors("vae.");
+
         if (version == VERSION_SVD) {
-            clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_loader.tensor_storages_types);
+            clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_loader.tensor_storages_types, cc_clip_l);
             clip_vision->alloc_params_buffer();
             clip_vision->get_param_tensors(tensors);

             diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version);
             diffusion_model->alloc_params_buffer();
             diffusion_model->get_param_tensors(tensors);

-            first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, true, version);
+            first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_loader.tensor_storages_types, vae_decode_only, true, version, cc_vae);
             LOG_DEBUG("vae_decode_only %d", vae_decode_only);
             first_stage_model->alloc_params_buffer();
-            first_stage_model->get_param_tensors(tensors, "first_stage_model");
+            first_stage_model->get_param_tensors(tensors);
         } else {
             clip_backend = backend;
             if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
@@ -457,16 +462,16 @@ class StableDiffusionGGML {
                 if (diffusion_flash_attn) {
                     LOG_WARN("flash attention in this diffusion model is currently unsupported!");
                 }
-                cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
+                cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, model_loader.tensor_storages_types, -1, cc_clip_l, cc_clip_g, cc_t5xxl);
                 diffusion_model = std::make_shared<MMDiTModel>(backend, model_loader.tensor_storages_types);
             } else if (sd_version_is_flux(version)) {
-                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
+                cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types, -1, cc_clip_l, cc_t5xxl);
                 diffusion_model = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
             } else {
                 if (id_embeddings_path.find("v2") != std::string::npos) {
-                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2);
+                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2, -1, cc_clip_l, cc_clip_g);
                 } else {
-                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version);
+                    cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_1, -1, cc_clip_l, cc_clip_g);
                 }
                 diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
             }
@@ -484,12 +489,12 @@ class StableDiffusionGGML {
             } else {
                 vae_backend = backend;
             }
-            first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, false, version);
+            first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, vae_decode_only, false, version, cc_vae);
             first_stage_model->alloc_params_buffer();
-            first_stage_model->get_param_tensors(tensors, "first_stage_model");
+            first_stage_model->get_param_tensors(tensors);
         }
         if (use_tiny_autoencoder) {
-            tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, version);
+            tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_loader.tensor_storages_types, vae_decode_only, version, cc_vae);
         }
         // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
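
Each cc_* flag therefore flips to the CompVis naming only when the checkpoint contains cond_stage_model. (or first_stage_model. for the VAE) tensors and lacks the corresponding split-layout prefix; a checkpoint that carries both keeps the split names. A standalone sketch of that derivation, with invented tensor names:

    #include <cstdio>
    #include <string>
    #include <vector>

    // Same substring matching rule as ModelLoader::has_prefix_tensors.
    static bool has_prefix_tensors(const std::vector<std::string>& names, const std::string& p) {
        for (const auto& n : names) {
            if (n.find(p) != std::string::npos) {
                return true;
            }
        }
        return false;
    }

    int main() {
        // Hypothetical SDXL checkpoint in the original CompVis layout.
        std::vector<std::string> names = {
            "cond_stage_model.transformer.text_model.final_layer_norm.weight",
            "cond_stage_model.1.transformer.text_model.final_layer_norm.weight",
            "first_stage_model.decoder.conv_in.weight",
        };

        // Flag derivation copied from StableDiffusionGGML after this commit.
        bool cc_clip_l = has_prefix_tensors(names, "cond_stage_model.") && !has_prefix_tensors(names, "text_encoders.clip_l.");
        bool cc_clip_g = has_prefix_tensors(names, "cond_stage_model.") && !has_prefix_tensors(names, "text_encoders.clip_g.");
        bool cc_t5xxl  = has_prefix_tensors(names, "cond_stage_model.") && !has_prefix_tensors(names, "text_encoders.t5xxl.");
        bool cc_vae    = has_prefix_tensors(names, "first_stage_model.") && !has_prefix_tensors(names, "vae.");

        // Prints: cc_clip_l=1 cc_clip_g=1 cc_t5xxl=1 cc_vae=1
        std::printf("cc_clip_l=%d cc_clip_g=%d cc_t5xxl=%d cc_vae=%d\n",
                    (int)cc_clip_l, (int)cc_clip_g, (int)cc_t5xxl, (int)cc_vae);
        return 0;
    }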

tae.hpp

Lines changed: 9 additions & 3 deletions
@@ -192,18 +192,24 @@ class TAESD : public GGMLBlock {
 };

 struct TinyAutoEncoder : public GGMLRunner {
+    bool cc_vae;
     TAESD taesd;
     bool decode_only = false;

     TinyAutoEncoder(ggml_backend_t backend,
                     std::map<std::string, enum ggml_type>& tensor_types,
-                    const std::string prefix,
                     bool decoder_only = true,
-                    SDVersion version = VERSION_SD1)
+                    SDVersion version = VERSION_SD1,
+                    bool cc_vae = false)
         : decode_only(decoder_only),
           taesd(decode_only, version),
           GGMLRunner(backend) {
-        taesd.init(params_ctx, tensor_types, prefix);
+        this->cc_vae = cc_vae;
+        taesd.init(params_ctx, tensor_types, vae_prefix());
+    }
+
+    std::string vae_prefix() {
+        return cc_vae ? "first_stage_model" : "";
     }

     std::string get_desc() {

vae.hpp

Lines changed: 11 additions & 5 deletions
@@ -521,25 +521,31 @@ class AutoencodingEngine : public GGMLBlock {
 };

 struct AutoEncoderKL : public GGMLRunner {
+    bool cc_vae;
     bool decode_only = true;
     AutoencodingEngine ae;

     AutoEncoderKL(ggml_backend_t backend,
                   std::map<std::string, enum ggml_type>& tensor_types,
-                  const std::string prefix,
                   bool decode_only = false,
                   bool use_video_decoder = false,
-                  SDVersion version = VERSION_SD1)
+                  SDVersion version = VERSION_SD1,
+                  bool cc_vae = false)
         : decode_only(decode_only), ae(decode_only, use_video_decoder, version), GGMLRunner(backend) {
-        ae.init(params_ctx, tensor_types, prefix);
+        this->cc_vae = cc_vae;
+        ae.init(params_ctx, tensor_types, vae_prefix());
+    }
+
+    std::string vae_prefix() {
+        return cc_vae ? "first_stage_model" : "";
     }

     std::string get_desc() {
         return "vae";
     }

-    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
-        ae.get_param_tensors(tensors, prefix);
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
+        ae.get_param_tensors(tensors, vae_prefix());
     }

     struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
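
AutoEncoderKL and TinyAutoEncoder (in tae.hpp above) share the same rule: vae_prefix() is "first_stage_model" for CompVis checkpoints and empty otherwise, and get_param_tensors() no longer takes a caller-supplied prefix. A small sketch of the resulting tensor names; the "decoder.conv_in.weight" suffix and the dot-joining rule are illustrative assumptions, not taken from this diff:

    #include <cstdio>
    #include <string>

    // Prefix rule shared by AutoEncoderKL and TinyAutoEncoder after this commit.
    std::string vae_prefix(bool cc_vae) {
        return cc_vae ? "first_stage_model" : "";
    }

    int main() {
        const bool flags[] = {true, false};
        for (bool cc : flags) {
            std::string prefix = vae_prefix(cc);
            // Assumed joining rule: a non-empty prefix is separated by a dot;
            // "decoder.conv_in.weight" is an illustrative suffix.
            std::string name = prefix.empty() ? "decoder.conv_in.weight"
                                              : prefix + ".decoder.conv_in.weight";
            std::printf("cc_vae=%d -> %s\n", (int)cc, name.c_str());
        }
        return 0;
    }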
