feat(tx): sdxl refiner

thxCode · thxCode · commit 7d9176a8b047 · 2024-12-29T17:00:13.000+08:00
Signed-off-by: thxCode <thxcode0824@gmail.com>
diff --git a/conditioner.hpp b/conditioner.hpp
@@ -73,42 +73,52 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         } else if (sd_version_is_sd2(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
         } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
+            if (version != VERSION_SDXL_REFINER) {
+                text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
+            }
             text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         }
     }
 
     void set_clip_skip(int clip_skip) {
-        text_model->set_clip_skip(clip_skip);
-        if (sd_version_is_sdxl(version)) {
+        if (text_model) {
+            text_model->set_clip_skip(clip_skip);
+        }
+        if (text_model2) {
             text_model2->set_clip_skip(clip_skip);
         }
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
-        if (sd_version_is_sdxl(version)) {
+        if (text_model) {
+            text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
+        }
+        if (text_model2) {
             text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
         }
     }
 
     void alloc_params_buffer() {
-        text_model->alloc_params_buffer();
-        if (sd_version_is_sdxl(version)) {
+        if (text_model) {
+            text_model->alloc_params_buffer();
+        }
+        if (text_model2) {
             text_model2->alloc_params_buffer();
         }
     }
 
     void free_params_buffer() {
-        text_model->free_params_buffer();
-        if (sd_version_is_sdxl(version)) {
+        if (text_model) {
+            text_model->free_params_buffer();
+        }
+        if (text_model2) {
             text_model2->free_params_buffer();
         }
     }
 
     size_t get_params_buffer_size() {
-        size_t buffer_size = text_model->get_params_buffer_size();
-        if (sd_version_is_sdxl(version)) {
+        size_t buffer_size = text_model ? text_model->get_params_buffer_size() : 0;
+        if (text_model2) {
             buffer_size += text_model2->get_params_buffer_size();
         }
         return buffer_size;
@@ -131,7 +141,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         params.no_alloc               = false;
         struct ggml_context* embd_ctx = ggml_init(params);
         struct ggml_tensor* embd      = NULL;
-        int64_t hidden_size           = text_model->model.hidden_size;
+        int64_t hidden_size           = text_model ? text_model->model.hidden_size : text_model2->model.hidden_size;
         auto on_load                  = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
             if (tensor_storage.ne[0] != hidden_size) {
                 LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], hidden_size);
@@ -148,7 +158,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                embd->data,
                ggml_nbytes(embd));
         for (int i = 0; i < embd->ne[1]; i++) {
-            bpe_tokens.push_back(text_model->model.vocab_size + num_custom_embeddings);
+            bpe_tokens.push_back((text_model ? text_model->model.vocab_size : text_model2->model.vocab_size) + num_custom_embeddings);
             // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
             num_custom_embeddings++;
         }
@@ -162,7 +172,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                 int32_t image_token,
                                 bool padding = false) {
         return tokenize_with_trigger_token(text, num_input_imgs, image_token,
-                                           text_model->model.n_token, padding);
+                                           text_model ? text_model->model.n_token : text_model2->model.n_token, padding);
     }
 
     std::vector<int> convert_token_to_id(std::string text) {
@@ -311,7 +321,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 
     std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
                                                              bool padding = false) {
-        return tokenize(text, text_model->model.n_token, padding);
+        return tokenize(text, text_model ? text_model->model.n_token : text_model2->model.n_token, padding);
     }
 
     std::pair<std::vector<int>, std::vector<float>> tokenize(std::string text,
@@ -419,28 +429,31 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             }
 
             {
-                text_model->compute(n_threads,
-                                    input_ids,
-                                    num_custom_embeddings,
-                                    token_embed_custom.data(),
-                                    max_token_idx,
-                                    false,
-                                    &chunk_hidden_states1,
-                                    work_ctx);
-                if (sd_version_is_sdxl(version)) {
+                if (text_model) {
+                    text_model->compute(n_threads,
+                                        input_ids,
+                                        num_custom_embeddings,
+                                        token_embed_custom.data(),
+                                        max_token_idx,
+                                        false,
+                                        &chunk_hidden_states1,
+                                        work_ctx);
+                }
+                if (text_model2) {
                     text_model2->compute(n_threads,
-                                         input_ids2,
-                                         0,
-                                         NULL,
+                                         text_model ? input_ids2 : input_ids,
+                                         text_model ? 0 : num_custom_embeddings,
+                                         text_model ? NULL : token_embed_custom.data(),
                                          max_token_idx,
                                          false,
-                                         &chunk_hidden_states2, work_ctx);
+                                         text_model ? &chunk_hidden_states2 : &chunk_hidden_states1,
+                                         work_ctx);
                     // concat
-                    chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
+                    chunk_hidden_states = text_model ? ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0) : chunk_hidden_states1;
 
                     if (chunk_idx == 0) {
                         text_model2->compute(n_threads,
-                                             input_ids2,
+                                             text_model ? input_ids2 : input_ids,
                                              0,
                                              NULL,
                                              max_token_idx,
@@ -486,7 +499,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                         ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
 
         ggml_tensor* vec = NULL;
-        if (sd_version_is_sdxl(version)) {
+        if (sd_version_is_sdxl(version) && version != VERSION_SDXL_REFINER) {
             int out_dim = 256;
             vec         = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels);
             // [0:1280]
diff --git a/control.hpp b/control.hpp
@@ -23,14 +23,14 @@ class ControlNetBlock : public GGMLBlock {
     std::vector<int> attention_resolutions = {4, 2, 1};
     std::vector<int> channel_mult          = {1, 2, 4, 4};
     std::vector<int> transformer_depth     = {1, 1, 1, 1};
-    int time_embed_dim                     = 1280;  // model_channels*4
+    int time_embed_dim                     = 1280;  // model_channels*4, 1536 for VERSION_SDXL_REFINER
     int num_heads                          = 8;
     int num_head_channels                  = -1;   // channels // num_heads
-    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
+    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL, 1280 for VERSION_SDXL_REFINER
 
 public:
-    int model_channels  = 320;
-    int adm_in_channels = 2816;  // only for VERSION_SDXL
+    int model_channels  = 320;   // 384 for VERSION_SDXL_REFINER
+    int adm_in_channels = 2816;  // 2816 for VERSION_SDXL/SVD, 2560 for VERSION_SDXL_REFINER
 
     ControlNetBlock(SDVersion version = VERSION_SD1)
         : version(version) {
@@ -45,6 +45,17 @@ class ControlNetBlock : public GGMLBlock {
             transformer_depth     = {1, 2, 10};
             num_head_channels     = 64;
             num_heads             = -1;
+            if (version == VERSION_SDXL_REFINER) {
+                time_embed_dim        = 1536;
+                context_dim           = 1280;
+                model_channels        = 384;
+                adm_in_channels       = 2560;
+                attention_resolutions = {4, 2};
+                channel_mult          = {1, 2, 4, 4};
+                transformer_depth     = {4, 4, 4, 4};
+                num_head_channels     = 64;
+                num_heads             = -1;
+            }
         } else if (version == VERSION_SVD) {
             in_channels       = 8;
             out_channels      = 4;
diff --git a/denoiser.hpp b/denoiser.hpp
@@ -170,12 +170,17 @@ struct AYSSchedule : SigmaSchedule {
 
         switch (version) {
             case VERSION_SD2: /* fallthrough */
+            case VERSION_SD2_INPAINT:
                 LOG_WARN("AYS not designed for SD2.X models");
+                return results;
             case VERSION_SD1:
+            case VERSION_SD1_INPAINT:
                 LOG_INFO("AYS using SD1.5 noise levels");
                 inputs = noise_levels[0];
                 break;
             case VERSION_SDXL:
+            case VERSION_SDXL_REFINER:
+            case VERSION_SDXL_INPAINT:
                 LOG_INFO("AYS using SDXL noise levels");
                 inputs = noise_levels[1];
                 break;
diff --git a/model.cpp b/model.cpp
@@ -1474,6 +1474,7 @@ SDVersion ModelLoader::get_sd_version() {
 
     bool is_xl = false;
     bool is_flux = false;
+    bool is_refiner = false;
 
 #define found_family (is_xl || is_flux)
     for (auto& tensor_storage : tensor_storages) {
@@ -1505,6 +1506,9 @@ SDVersion ModelLoader::get_sd_version() {
                     }
                 }
             }
+            if (tensor_storage.name.find("model.diffusion_model.output_blocks.11.0.") != std::string::npos) {
+                is_refiner = true;
+            }
             if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {
                 return VERSION_SVD;
             }
@@ -1528,6 +1532,9 @@ SDVersion ModelLoader::get_sd_version() {
     }
     bool is_inpaint = input_block_weight.ne[2] == 9;
     if (is_xl) {
+        if (is_refiner) {
+            return VERSION_SDXL_REFINER;
+        }
         if (is_inpaint) {
             return VERSION_SDXL_INPAINT;
         }
diff --git a/model.h b/model.h
@@ -23,6 +23,7 @@ enum SDVersion {
     VERSION_SD2,
     VERSION_SD2_INPAINT,
     VERSION_SDXL,
+    VERSION_SDXL_REFINER,
     VERSION_SDXL_INPAINT,
     VERSION_SVD,
     VERSION_SD3,
@@ -60,7 +61,7 @@ static inline bool sd_version_is_sd2(SDVersion version) {
 }
 
 static inline bool sd_version_is_sdxl(SDVersion version) {
-    if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT) {
+    if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER || version == VERSION_SDXL_INPAINT) {
         return true;
     }
     return false;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -32,6 +32,7 @@ const char* model_version_to_str[] = {
     "SD 2.x",
     "SD 2.x Inpaint",
     "SDXL",
+    "SDXL Refiner",
     "SDXL Inpaint",
     "SVD",
     "SD3.x",
diff --git a/unet.hpp b/unet.hpp
@@ -175,14 +175,14 @@ class UnetModelBlock : public GGMLBlock {
     std::vector<int> attention_resolutions = {4, 2, 1};
     std::vector<int> channel_mult          = {1, 2, 4, 4};
     std::vector<int> transformer_depth     = {1, 1, 1, 1};
-    int time_embed_dim                     = 1280;  // model_channels*4
+    int time_embed_dim                     = 1280;  // model_channels*4, 1536 for VERSION_SDXL_REFINER
     int num_heads                          = 8;
     int num_head_channels                  = -1;   // channels // num_heads
-    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
+    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL, 1280 for VERSION_SDXL_REFINER
 
 public:
-    int model_channels  = 320;
-    int adm_in_channels = 2816;  // only for VERSION_SDXL/SVD
+    int model_channels  = 320;   // 384 for VERSION_SDXL_REFINER
+    int adm_in_channels = 2816;  // 2816 for VERSION_SDXL/SVD, 2560 for VERSION_SDXL_REFINER
 
     UnetModelBlock(SDVersion version = VERSION_SD1, std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types, bool flash_attn = false)
         : version(version) {
@@ -197,6 +197,17 @@ class UnetModelBlock : public GGMLBlock {
             transformer_depth     = {1, 2, 10};
             num_head_channels     = 64;
             num_heads             = -1;
+            if (version == VERSION_SDXL_REFINER) {
+                time_embed_dim        = 1536;
+                context_dim           = 1280;
+                model_channels        = 384;
+                adm_in_channels       = 2560;
+                attention_resolutions = {4, 2};
+                channel_mult          = {1, 2, 4, 4};
+                transformer_depth     = {4, 4, 4, 4};
+                num_head_channels     = 64;
+                num_heads             = -1;
+            }
         } else if (version == VERSION_SVD) {
             in_channels       = 8;
             out_channels      = 4;

Original file line number	Diff line number	Diff line change
`@@ -1474,6 +1474,7 @@ SDVersion ModelLoader::get_sd_version() {`
`1474`	`1474`
`1475`	`1475`	`bool is_xl = false;`
`1476`	`1476`	`bool is_flux = false;`
	`1477`	`+ bool is_refiner = false;`
`1477`	`1478`
`1478`	`1479`	`#define found_family (is_xl \|\| is_flux)`
`1479`	`1480`	`for (auto& tensor_storage : tensor_storages) {`
`@@ -1505,6 +1506,9 @@ SDVersion ModelLoader::get_sd_version() {`
`1505`	`1506`	`}`
`1506`	`1507`	`}`
`1507`	`1508`	`}`
	`1509`	`+ if (tensor_storage.name.find("model.diffusion_model.output_blocks.11.0.") != std::string::npos) {`
	`1510`	`+ is_refiner = true;`
	`1511`	`+ }`
`1508`	`1512`	`if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {`
`1509`	`1513`	`return VERSION_SVD;`
`1510`	`1514`	`}`
`@@ -1528,6 +1532,9 @@ SDVersion ModelLoader::get_sd_version() {`
`1528`	`1532`	`}`
`1529`	`1533`	`bool is_inpaint = input_block_weight.ne[2] == 9;`
`1530`	`1534`	`if (is_xl) {`
	`1535`	`+ if (is_refiner) {`
	`1536`	`+ return VERSION_SDXL_REFINER;`
	`1537`	`+ }`
`1531`	`1538`	`if (is_inpaint) {`
`1532`	`1539`	`return VERSION_SDXL_INPAINT;`
`1533`	`1540`	`}`