feat: sdxl refiner

thxCode · thxCode · commit 8f303b14f370 · 2024-11-27T14:40:19.000+08:00
Signed-off-by: thxCode <thxcode0824@gmail.com>
diff --git a/conditioner.hpp b/conditioner.hpp
@@ -65,7 +65,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         : version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir), wtype(wtype) {
         if (clip_skip <= 0) {
             clip_skip = 1;
-            if (version == VERSION_SD2 || version == VERSION_SDXL) {
+            if (version == VERSION_SD2 || version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
                 clip_skip = 2;
             }
         }
@@ -76,40 +76,53 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         } else if (version == VERSION_SDXL) {
             text_model  = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip, false);
             text_model2 = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+        } else if (version == VERSION_SDXL_REFINER) {
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         }
     }
 
     void set_clip_skip(int clip_skip) {
-        text_model->set_clip_skip(clip_skip);
-        if (version == VERSION_SDXL) {
+        if (version != VERSION_SDXL_REFINER) {
+            text_model->set_clip_skip(clip_skip);
+        }
+        if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
             text_model2->set_clip_skip(clip_skip);
         }
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
-        if (version == VERSION_SDXL) {
+        if (version != VERSION_SDXL_REFINER) {
+            text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
+        }
+        if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
             text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
         }
     }
 
     void alloc_params_buffer() {
-        text_model->alloc_params_buffer();
-        if (version == VERSION_SDXL) {
+        if (version != VERSION_SDXL_REFINER) {
+            text_model->alloc_params_buffer();
+        }
+        if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
             text_model2->alloc_params_buffer();
         }
     }
 
     void free_params_buffer() {
-        text_model->free_params_buffer();
-        if (version == VERSION_SDXL) {
+        if (version != VERSION_SDXL_REFINER) {
+            text_model->free_params_buffer();
+        }
+        if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
             text_model2->free_params_buffer();
         }
     }
 
     size_t get_params_buffer_size() {
-        size_t buffer_size = text_model->get_params_buffer_size();
-        if (version == VERSION_SDXL) {
+        size_t buffer_size = 0;
+        if (version != VERSION_SDXL_REFINER) {
+            buffer_size = text_model->get_params_buffer_size();
+        }
+        if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
             buffer_size += text_model2->get_params_buffer_size();
         }
         return buffer_size;
@@ -132,8 +145,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         params.no_alloc               = false;
         struct ggml_context* embd_ctx = ggml_init(params);
         struct ggml_tensor* embd      = NULL;
-        int64_t hidden_size           = text_model->model.hidden_size;
-        auto on_load                  = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
+        int64_t hidden_size           = 0;
+        if (version != VERSION_SDXL_REFINER) {
+            hidden_size = text_model->model.hidden_size;
+        } else {
+            hidden_size = text_model2->model.hidden_size;
+        }
+        auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
             if (tensor_storage.ne[0] != hidden_size) {
                 LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], hidden_size);
                 return false;
@@ -149,7 +167,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                embd->data,
                ggml_nbytes(embd));
         for (int i = 0; i < embd->ne[1]; i++) {
-            bpe_tokens.push_back(text_model->model.vocab_size + num_custom_embeddings);
+            if (version != VERSION_SDXL_REFINER) {
+                bpe_tokens.push_back(text_model->model.vocab_size + num_custom_embeddings);
+            } else {
+                bpe_tokens.push_back(text_model2->model.vocab_size + num_custom_embeddings);
+            }
             // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
             num_custom_embeddings++;
         }
@@ -163,7 +185,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                 int32_t image_token,
                                 bool padding = false) {
         return tokenize_with_trigger_token(text, num_input_imgs, image_token,
-                                           text_model->model.n_token, padding);
+                                           version != VERSION_SDXL_REFINER ? text_model->model.n_token : text_model2->model.n_token, padding);
     }
 
     std::vector<int> convert_token_to_id(std::string text) {
@@ -312,7 +334,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 
     std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
                                                              bool padding = false) {
-        return tokenize(text, text_model->model.n_token, padding);
+        return tokenize(text, version != VERSION_SDXL_REFINER ? text_model->model.n_token : text_model2->model.n_token, padding);
     }
 
     std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
@@ -403,7 +425,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             auto input_ids                 = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
             struct ggml_tensor* input_ids2 = NULL;
             size_t max_token_idx           = 0;
-            if (version == VERSION_SDXL) {
+            if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
                 auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID);
                 if (it != chunk_tokens.end()) {
                     std::fill(std::next(it), chunk_tokens.end(), 0);
@@ -428,16 +450,20 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                     false,
                                     &chunk_hidden_states1,
                                     work_ctx);
-                if (version == VERSION_SDXL) {
+                if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
                     text_model2->compute(n_threads,
                                          input_ids2,
                                          0,
                                          NULL,
                                          max_token_idx,
                                          false,
                                          &chunk_hidden_states2, work_ctx);
-                    // concat
-                    chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
+                    if (version == VERSION_SDXL) {
+                        // concat
+                        chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
+                    } else {
+                        chunk_hidden_states = chunk_hidden_states2;
+                    }
 
                     if (chunk_idx == 0) {
                         text_model2->compute(n_threads,
@@ -487,7 +513,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                         ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
 
         ggml_tensor* vec = NULL;
-        if (version == VERSION_SDXL) {
+        if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
             int out_dim = 256;
             vec         = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels);
             // [0:1280]
diff --git a/control.hpp b/control.hpp
@@ -23,14 +23,14 @@ class ControlNetBlock : public GGMLBlock {
     std::vector<int> attention_resolutions = {4, 2, 1};
     std::vector<int> channel_mult          = {1, 2, 4, 4};
     std::vector<int> transformer_depth     = {1, 1, 1, 1};
-    int time_embed_dim                     = 1280;  // model_channels*4
+    int time_embed_dim                     = 1280;  // model_channels*4, 1536 for VERSION_SDXL_REFINER
     int num_heads                          = 8;
     int num_head_channels                  = -1;   // channels // num_heads
-    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
+    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL, 1280 for VERSION_SDXL_REFINER
 
 public:
-    int model_channels  = 320;
-    int adm_in_channels = 2816;  // only for VERSION_SDXL
+    int model_channels  = 320;   // 384 for VERSION_SDXL_REFINER
+    int adm_in_channels = 2816;  // 2816 for VERSION_SDXL/SVD, 2560 for VERSION_SDXL_REFINER
 
     ControlNetBlock(SDVersion version = VERSION_SD1)
         : version(version) {
@@ -45,6 +45,16 @@ class ControlNetBlock : public GGMLBlock {
             transformer_depth     = {1, 2, 10};
             num_head_channels     = 64;
             num_heads             = -1;
+        } else if (version == VERSION_SDXL_REFINER) {
+            time_embed_dim        = 1536;
+            context_dim           = 1280;
+            model_channels        = 384;
+            adm_in_channels       = 2560;
+            attention_resolutions = {4, 2};
+            channel_mult          = {1, 2, 4, 4};
+            transformer_depth     = {4, 4, 4, 4};
+            num_head_channels     = 64;
+            num_heads             = -1;
         } else if (version == VERSION_SVD) {
             in_channels       = 8;
             out_channels      = 4;
@@ -58,7 +68,7 @@ class ControlNetBlock : public GGMLBlock {
         // time_embed_1 is nn.SiLU()
         blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
 
-        if (version == VERSION_SDXL || version == VERSION_SVD) {
+        if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER || version == VERSION_SVD) {
             blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
             // label_emb_1 is nn.SiLU()
             blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
diff --git a/denoiser.hpp b/denoiser.hpp
@@ -176,6 +176,7 @@ struct AYSSchedule : SigmaSchedule {
                 inputs = noise_levels[0];
                 break;
             case VERSION_SDXL:
+            case VERSION_SDXL_REFINER:
                 LOG_INFO("AYS using SDXL noise levels");
                 inputs = noise_levels[1];
                 break;
diff --git a/model.cpp b/model.cpp
@@ -1463,10 +1463,12 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
 
 SDVersion ModelLoader::get_sd_version() {
     TensorStorage token_embedding_weight;
-    bool is_flux    = false;
-    bool is_schnell = true;
-    bool is_lite    = true;
-    bool is_sd3     = false;
+    bool is_flux      = false;
+    bool is_schnell   = true;
+    bool is_lite      = true;
+    bool is_sdxl      = false;
+    bool is_sdxl_base = false;
+    bool is_sd3       = false;
     for (auto& tensor_storage : tensor_storages) {
         if (tensor_storage.name.find("model.diffusion_model.guidance_in.in_layer.weight") != std::string::npos) {
             is_schnell = false;
@@ -1486,11 +1488,15 @@ SDVersion ModelLoader::get_sd_version() {
         if (tensor_storage.name.find("model.diffusion_model.joint_blocks.23.") != std::string::npos) {
             is_sd3 = true;
         }
-        if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos) {
-            return VERSION_SDXL;
+        if (tensor_storage.name == "conditioner.embedders.0.model.token_embedding.weight" ||
+            tensor_storage.name == "cond_stage_model.1.transformer.text_model.embeddings.token_embedding.weight") {
+            if (tensor_storage.ne[0] == 1280) {
+                is_sdxl = true;
+            }
         }
-        if (tensor_storage.name.find("cond_stage_model.1") != std::string::npos) {
-            return VERSION_SDXL;
+        if ((tensor_storage.name == "conditioner.embedders.1.model.token_embedding.weight" && tensor_storage.ne[0] == 1280) ||
+            (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" && tensor_storage.ne[0] == 768)) {
+            is_sdxl_base = true;
         }
         if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {
             return VERSION_SVD;
@@ -1519,6 +1525,12 @@ SDVersion ModelLoader::get_sd_version() {
     if (is_sd3) {
         return VERSION_SD3_2B;
     }
+    if (is_sdxl && !is_sdxl_base) {
+        return VERSION_SDXL_REFINER;
+    }
+    if (is_sdxl) {
+        return VERSION_SDXL;
+    }
     if (token_embedding_weight.ne[0] == 768) {
         return VERSION_SD1;
     } else if (token_embedding_weight.ne[0] == 1024) {
diff --git a/model.h b/model.h
@@ -21,6 +21,7 @@ enum SDVersion {
     VERSION_SD1,
     VERSION_SD2,
     VERSION_SDXL,
+    VERSION_SDXL_REFINER,
     VERSION_SVD,
     VERSION_SD3_2B,
     VERSION_FLUX_DEV,
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -28,6 +28,7 @@ const char* model_version_to_str[] = {
     "SD 1.x",
     "SD 2.x",
     "SDXL",
+    "SDXL Refiner",
     "SVD",
     "SD3 2B",
     "Flux Dev",
@@ -328,7 +329,7 @@ class StableDiffusionGGML {
             vae_wtype             = wtype;
         }
 
-        if (version == VERSION_SDXL) {
+        if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
             vae_wtype = GGML_TYPE_F32;
         }
 
@@ -339,7 +340,7 @@ class StableDiffusionGGML {
 
         LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));
 
-        if (version == VERSION_SDXL) {
+        if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
             scale_factor = 0.13025f;
             if (vae_path.size() == 0 && taesd_path.size() == 0) {
                 LOG_WARN(
@@ -1378,7 +1379,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     SDCondition uncond;
     if (cfg_scale != 1.0) {
         bool force_zero_embeddings = false;
-        if (sd_ctx->sd->version == VERSION_SDXL && negative_prompt.size() == 0) {
+        if ((sd_ctx->sd->version == VERSION_SDXL || sd_ctx->sd->version == VERSION_SDXL_REFINER) && negative_prompt.size() == 0) {
             force_zero_embeddings = true;
         }
         uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
diff --git a/unet.hpp b/unet.hpp
@@ -174,14 +174,14 @@ class UnetModelBlock : public GGMLBlock {
     std::vector<int> attention_resolutions = {4, 2, 1};
     std::vector<int> channel_mult          = {1, 2, 4, 4};
     std::vector<int> transformer_depth     = {1, 1, 1, 1};
-    int time_embed_dim                     = 1280;  // model_channels*4
+    int time_embed_dim                     = 1280;  // model_channels*4, 1536 for VERSION_SDXL_REFINER
     int num_heads                          = 8;
     int num_head_channels                  = -1;   // channels // num_heads
-    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
+    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL, 1280 for VERSION_SDXL_REFINER
 
 public:
-    int model_channels  = 320;
-    int adm_in_channels = 2816;  // only for VERSION_SDXL/SVD
+    int model_channels  = 320;   // 384 for VERSION_SDXL_REFINER
+    int adm_in_channels = 2816;  // 2816 for VERSION_SDXL/SVD, 2560 for VERSION_SDXL_REFINER
 
     UnetModelBlock(SDVersion version = VERSION_SD1, bool flash_attn = false)
         : version(version) {
@@ -196,6 +196,16 @@ class UnetModelBlock : public GGMLBlock {
             transformer_depth     = {1, 2, 10};
             num_head_channels     = 64;
             num_heads             = -1;
+        } else if (version == VERSION_SDXL_REFINER) {
+            time_embed_dim        = 1536;
+            context_dim           = 1280;
+            model_channels        = 384;
+            adm_in_channels       = 2560;
+            attention_resolutions = {4, 2};
+            channel_mult          = {1, 2, 4, 4};
+            transformer_depth     = {4, 4, 4, 4};
+            num_head_channels     = 64;
+            num_heads             = -1;
         } else if (version == VERSION_SVD) {
             in_channels       = 8;
             out_channels      = 4;
@@ -211,7 +221,7 @@ class UnetModelBlock : public GGMLBlock {
         // time_embed_1 is nn.SiLU()
         blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
 
-        if (version == VERSION_SDXL || version == VERSION_SVD) {
+        if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER || version == VERSION_SVD) {
             blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
             // label_emb_1 is nn.SiLU()
             blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));