Skip to content

Commit 78629d6

Browse files
committed
refactor(tx): speed up q4_0 loading
Signed-off-by: thxCode <thxcode0824@gmail.com>
1 parent: 34671f9 · commit: 78629d6

File tree

4 files changed

+99
-50
lines changed

4 files changed

+99
-50
lines changed

conditioner.hpp

Lines changed: 21 additions & 23 deletions
Original file line number · Diff line number · Diff line change
@@ -48,7 +48,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
4848
SDVersion version = VERSION_SD1;
4949
PMVersion pm_version = PM_VERSION_1;
5050
CLIPTokenizer tokenizer;
51-
ggml_type wtype;
5251
std::shared_ptr<CLIPTextModelRunner> text_model;
5352
std::shared_ptr<CLIPTextModelRunner> text_model2;
5453

@@ -59,7 +58,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
5958
std::vector<std::string> readed_embeddings;
6059

6160
FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
62-
ggml_type wtype,
61+
ggml_type clip_l_wtype,
62+
ggml_type clip_g_wtype,
6363
const std::string& embd_dir,
6464
SDVersion version = VERSION_SD1,
6565
PMVersion pv = PM_VERSION_1,
@@ -70,7 +70,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
7070
pm_version(pv),
7171
tokenizer(version == VERSION_SD2 ? 0 : 49407),
7272
embd_dir(embd_dir),
73-
wtype(wtype),
7473
compvis_compatiblity_clip_l(compvis_compatiblity_clip_l),
7574
compvis_compatiblity_clip_g(compvis_compatiblity_clip_g) {
7675
if (clip_skip <= 0) {
@@ -80,14 +79,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
8079
}
8180
}
8281
if (version == VERSION_SD1) {
83-
text_model = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip);
82+
text_model = std::make_shared<CLIPTextModelRunner>(backend, clip_l_wtype, OPENAI_CLIP_VIT_L_14, clip_skip);
8483
} else if (version == VERSION_SD2) {
85-
text_model = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_H_14, clip_skip);
84+
text_model = std::make_shared<CLIPTextModelRunner>(backend, clip_l_wtype, OPEN_CLIP_VIT_H_14, clip_skip);
8685
} else if (version == VERSION_SDXL) {
87-
text_model = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip, false);
88-
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
86+
text_model = std::make_shared<CLIPTextModelRunner>(backend, clip_l_wtype, OPENAI_CLIP_VIT_L_14, clip_skip, false);
87+
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, clip_g_wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
8988
} else if (version == VERSION_SDXL_REFINER) {
90-
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
89+
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, clip_g_wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
9190
}
9291
}
9392

@@ -174,14 +173,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
174173
LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], hidden_size);
175174
return false;
176175
}
177-
embd = ggml_new_tensor_2d(embd_ctx, wtype, hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
176+
embd = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
178177
*dst_tensor = embd;
179178
return true;
180179
};
181180
model_loader.load_tensors(on_load, NULL);
182181
readed_embeddings.push_back(embd_name);
183182
token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
184-
memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(wtype)),
183+
memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(embd->type)),
185184
embd->data,
186185
ggml_nbytes(embd));
187186
for (int i = 0; i < embd->ne[1]; i++) {
@@ -674,7 +673,6 @@ struct SD3CLIPEmbedder : public Conditioner {
674673
bool compvis_compatiblity_clip_l;
675674
bool compvis_compatiblity_clip_g;
676675
bool compvis_compatiblity_t5xxl;
677-
ggml_type wtype;
678676
CLIPTokenizer clip_l_tokenizer;
679677
CLIPTokenizer clip_g_tokenizer;
680678
T5UniGramTokenizer t5_tokenizer;
@@ -683,22 +681,23 @@ struct SD3CLIPEmbedder : public Conditioner {
683681
std::shared_ptr<T5Runner> t5;
684682

685683
SD3CLIPEmbedder(ggml_backend_t backend,
686-
ggml_type wtype,
684+
ggml_type clip_l_wtype,
685+
ggml_type clip_g_wtype,
686+
ggml_type t5xxl_wtype,
687687
bool compvis_compatiblity_clip_l = false,
688688
bool compvis_compatiblity_clip_g = false,
689689
bool compvis_compatiblity_t5xxl = false,
690690
int clip_skip = -1)
691-
: wtype(wtype),
692-
clip_g_tokenizer(0),
691+
: clip_g_tokenizer(0),
693692
compvis_compatiblity_clip_l(compvis_compatiblity_clip_l),
694693
compvis_compatiblity_clip_g(compvis_compatiblity_clip_g),
695694
compvis_compatiblity_t5xxl(compvis_compatiblity_t5xxl) {
696695
if (clip_skip <= 0) {
697696
clip_skip = 2;
698697
}
699-
clip_l = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip, false);
700-
clip_g = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
701-
t5 = std::make_shared<T5Runner>(backend, wtype);
698+
clip_l = std::make_shared<CLIPTextModelRunner>(backend, clip_l_wtype, OPENAI_CLIP_VIT_L_14, clip_skip, false);
699+
clip_g = std::make_shared<CLIPTextModelRunner>(backend, clip_g_wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
700+
t5 = std::make_shared<T5Runner>(backend, t5xxl_wtype);
702701
}
703702

704703
void set_clip_skip(int clip_skip) {
@@ -1042,25 +1041,24 @@ struct SD3CLIPEmbedder : public Conditioner {
10421041
struct FluxCLIPEmbedder : public Conditioner {
10431042
bool compvis_compatiblity_clip_l;
10441043
bool compvis_compatiblity_t5xxl;
1045-
ggml_type wtype;
10461044
CLIPTokenizer clip_l_tokenizer;
10471045
T5UniGramTokenizer t5_tokenizer;
10481046
std::shared_ptr<CLIPTextModelRunner> clip_l;
10491047
std::shared_ptr<T5Runner> t5;
10501048

10511049
FluxCLIPEmbedder(ggml_backend_t backend,
1052-
ggml_type wtype,
1050+
ggml_type clip_l_wtype,
1051+
ggml_type t5xxl_wtype,
10531052
bool compvis_compatiblity_clip_l = false,
10541053
bool compvis_compatiblity_t5xxl = false,
10551054
int clip_skip = -1)
1056-
: wtype(wtype),
1057-
compvis_compatiblity_clip_l(compvis_compatiblity_clip_l),
1055+
: compvis_compatiblity_clip_l(compvis_compatiblity_clip_l),
10581056
compvis_compatiblity_t5xxl(compvis_compatiblity_t5xxl) {
10591057
if (clip_skip <= 0) {
10601058
clip_skip = 2;
10611059
}
1062-
clip_l = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip, true);
1063-
t5 = std::make_shared<T5Runner>(backend, wtype);
1060+
clip_l = std::make_shared<CLIPTextModelRunner>(backend, clip_l_wtype, OPENAI_CLIP_VIT_L_14, clip_skip, true);
1061+
t5 = std::make_shared<T5Runner>(backend, t5xxl_wtype);
10641062
}
10651063

10661064
void set_clip_skip(int clip_skip) {

model.cpp

Lines changed: 11 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -1691,7 +1691,7 @@ ggml_type ModelLoader::get_sd_wtype() {
16911691
return get_diffusion_model_wtype();
16921692
}
16931693

1694-
ggml_type ModelLoader::get_conditioner_wtype() {
1694+
ggml_type ModelLoader::get_conditioner_wtype(std::vector<std::string> prefixes) {
16951695
for (auto& tensor_storage : tensor_storages) {
16961696
if (is_unused_tensor(tensor_storage.name)) {
16971697
continue;
@@ -1704,6 +1704,16 @@ ggml_type ModelLoader::get_conditioner_wtype() {
17041704
continue;
17051705
}
17061706

1707+
bool goahead = true;
1708+
if (!prefixes.empty()) {
1709+
goahead = std::any_of(prefixes.begin(), prefixes.end(), [&](const std::string& prefix) {
1710+
return tensor_storage.name.find(prefix) != std::string::npos;
1711+
});
1712+
}
1713+
if (!goahead) {
1714+
continue;
1715+
}
1716+
17071717
if (ggml_is_quantized(tensor_storage.type)) {
17081718
return tensor_storage.type;
17091719
}

model.h

Lines changed: 1 addition & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -183,7 +183,7 @@ class ModelLoader {
183183
bool init_from_safetensors_file(const std::string& dir_path, const std::string& file_prefix, ggml_type type, const std::string& prefix = "");
184184
SDVersion get_sd_version();
185185
ggml_type get_sd_wtype();
186-
ggml_type get_conditioner_wtype();
186+
ggml_type get_conditioner_wtype(std::vector<std::string> prefixes = {});
187187
ggml_type get_diffusion_model_wtype();
188188
ggml_type get_vae_wtype();
189189
bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend_t backend);

stable-diffusion.cpp

Lines changed: 66 additions & 25 deletions
Original file line number · Diff line number · Diff line change
@@ -116,7 +116,9 @@ class StableDiffusionGGML {
116116
ggml_backend_t control_net_backend = NULL;
117117
ggml_backend_t vae_backend = NULL;
118118
ggml_type model_wtype = GGML_TYPE_COUNT;
119-
ggml_type conditioner_wtype = GGML_TYPE_COUNT;
119+
ggml_type clip_l_wtype = GGML_TYPE_COUNT;
120+
ggml_type clip_g_wtype = GGML_TYPE_COUNT;
121+
ggml_type t5xxl_wtype = GGML_TYPE_COUNT;
120122
ggml_type diffusion_model_wtype = GGML_TYPE_COUNT;
121123
ggml_type vae_wtype = GGML_TYPE_COUNT;
122124

@@ -305,32 +307,79 @@ class StableDiffusionGGML {
305307
model_wtype = GGML_TYPE_F32;
306308
LOG_WARN("can not get mode wtype frome weight, use f32");
307309
}
308-
conditioner_wtype = model_loader.get_conditioner_wtype();
309-
if (conditioner_wtype == GGML_TYPE_COUNT) {
310-
conditioner_wtype = wtype;
310+
switch (version) {
311+
case VERSION_SVD:
312+
case VERSION_SD1:
313+
case VERSION_SD2:
314+
case VERSION_SDXL:
315+
case VERSION_SDXL_REFINER: {
316+
if (version != VERSION_SDXL_REFINER) {
317+
clip_l_wtype = model_loader.get_conditioner_wtype({"cond_stage_model.transformer.", "text_encoders.clip_l."});
318+
if (clip_l_wtype == GGML_TYPE_COUNT) {
319+
clip_l_wtype = wtype;
320+
}
321+
}
322+
if (version == VERSION_SDXL_REFINER || version == VERSION_SDXL) {
323+
clip_g_wtype = model_loader.get_conditioner_wtype({"cond_stage_model.1.", "text_encoders.clip_g."});
324+
if (clip_g_wtype == GGML_TYPE_COUNT) {
325+
clip_g_wtype = wtype;
326+
}
327+
}
328+
break;
329+
}
330+
case VERSION_SD3_2B:
331+
case VERSION_SD3_5_2B:
332+
case VERSION_SD3_5_8B: {
333+
clip_l_wtype = model_loader.get_conditioner_wtype({"cond_stage_model.transformer.", "text_encoders.clip_l."});
334+
if (clip_l_wtype == GGML_TYPE_COUNT) {
335+
clip_l_wtype = wtype;
336+
}
337+
clip_g_wtype = model_loader.get_conditioner_wtype({"cond_stage_model.1.", "text_encoders.clip_g."});
338+
if (clip_g_wtype == GGML_TYPE_COUNT) {
339+
clip_g_wtype = wtype;
340+
}
341+
t5xxl_wtype = model_loader.get_conditioner_wtype({"cond_stage_model.2.", "text_encoders.t5xxl."});
342+
if (t5xxl_wtype == GGML_TYPE_COUNT) {
343+
t5xxl_wtype = wtype;
344+
}
345+
break;
346+
}
347+
case VERSION_FLUX_LITE:
348+
case VERSION_FLUX_DEV:
349+
case VERSION_FLUX_SCHNELL: {
350+
clip_l_wtype = model_loader.get_conditioner_wtype({"cond_stage_model.transformer.", "text_encoders.clip_l."});
351+
if (clip_l_wtype == GGML_TYPE_COUNT) {
352+
clip_l_wtype = wtype;
353+
}
354+
t5xxl_wtype = model_loader.get_conditioner_wtype({"cond_stage_model.1.", "text_encoders.t5xxl."});
355+
if (t5xxl_wtype == GGML_TYPE_COUNT) {
356+
t5xxl_wtype = wtype;
357+
}
358+
break;
359+
}
311360
}
361+
312362
diffusion_model_wtype = model_loader.get_diffusion_model_wtype();
313363
if (diffusion_model_wtype == GGML_TYPE_COUNT) {
314364
diffusion_model_wtype = wtype;
315365
}
316366
vae_wtype = model_loader.get_vae_wtype();
317-
318367
if (vae_wtype == GGML_TYPE_COUNT) {
319368
vae_wtype = wtype;
320369
}
321370
} else {
322371
model_wtype = wtype;
323-
conditioner_wtype = wtype;
372+
clip_l_wtype = wtype;
373+
clip_g_wtype = wtype;
374+
t5xxl_wtype = wtype;
324375
diffusion_model_wtype = wtype;
325376
vae_wtype = wtype;
326377
}
327378

328-
if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
329-
vae_wtype = GGML_TYPE_F32;
330-
}
331-
332379
LOG_INFO("Weight type: %s", ggml_type_name(model_wtype));
333-
LOG_INFO("Conditioner weight type: %s", ggml_type_name(conditioner_wtype));
380+
LOG_INFO("CLIP_L weight type: %s", ggml_type_name(clip_l_wtype));
381+
LOG_INFO("CLIP_G weight type: %s", ggml_type_name(clip_g_wtype));
382+
LOG_INFO("T5XXL weight type: %s", ggml_type_name(t5xxl_wtype));
334383
LOG_INFO("Diffusion model weight type: %s", ggml_type_name(diffusion_model_wtype));
335384
LOG_INFO("VAE weight type: %s", ggml_type_name(vae_wtype));
336385

@@ -351,7 +400,7 @@ class StableDiffusionGGML {
351400
auto cc_vae = model_loader.has_prefix_tensors("first_stage_model.") && !model_loader.has_prefix_tensors("vae.");
352401

353402
if (version == VERSION_SVD) {
354-
clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, conditioner_wtype);
403+
clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, clip_l_wtype);
355404
clip_vision->alloc_params_buffer();
356405
clip_vision->get_param_tensors(tensors);
357406

@@ -364,15 +413,7 @@ class StableDiffusionGGML {
364413
first_stage_model->alloc_params_buffer();
365414
first_stage_model->get_param_tensors(tensors);
366415
} else {
367-
clip_backend = backend;
368-
bool use_t5xxl = false;
369-
if (sd_version_is_dit(version)) {
370-
use_t5xxl = true;
371-
}
372-
if (!ggml_backend_is_cpu(backend) && use_t5xxl && conditioner_wtype != diffusion_model_wtype) {
373-
clip_on_cpu = true;
374-
LOG_INFO("set clip_on_cpu to true");
375-
}
416+
clip_backend = backend;
376417
if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
377418
LOG_INFO("CLIP: Using CPU backend");
378419
clip_backend = ggml_backend_cpu_init();
@@ -384,16 +425,16 @@ class StableDiffusionGGML {
384425
if (diffusion_flash_attn) {
385426
LOG_WARN("flash attention in this diffusion model is currently unsupported!");
386427
}
387-
cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, conditioner_wtype, cc_clip_l, cc_clip_g, cc_t5xxl);
428+
cond_stage_model = std::make_shared<SD3CLIPEmbedder>(clip_backend, clip_l_wtype, clip_g_wtype, t5xxl_wtype, cc_clip_l, cc_clip_g, cc_t5xxl);
388429
diffusion_model = std::make_shared<MMDiTModel>(backend, diffusion_model_wtype, version);
389430
} else if (sd_version_is_flux(version)) {
390-
cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, conditioner_wtype, cc_clip_l, cc_t5xxl);
431+
cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, clip_l_wtype, t5xxl_wtype, cc_clip_l, cc_t5xxl);
391432
diffusion_model = std::make_shared<FluxModel>(backend, diffusion_model_wtype, version, diffusion_flash_attn);
392433
} else {
393434
if (id_embeddings_path.find("v2") != std::string::npos) {
394-
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, conditioner_wtype, embeddings_path, version, PM_VERSION_2, cc_clip_l, cc_clip_g);
435+
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, clip_l_wtype, clip_g_wtype, embeddings_path, version, PM_VERSION_2, cc_clip_l, cc_clip_g);
395436
} else {
396-
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, conditioner_wtype, embeddings_path, version, PM_VERSION_1, cc_clip_l, cc_clip_g);
437+
cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, clip_l_wtype, clip_g_wtype, embeddings_path, version, PM_VERSION_1, cc_clip_l, cc_clip_g);
397438
}
398439
diffusion_model = std::make_shared<UNetModel>(backend, diffusion_model_wtype, version, diffusion_flash_attn);
399440
}

0 commit comments

Comments (0)