
Commit 66f1a5d

Merge remote-tracking branch 'upstream/master'
2 parents: 35e06d8 + ac54e00

File tree

12 files changed: +236 -115 lines changed

assets/sd3.5_large.png

1.81 MB

conditioner.hpp

Lines changed: 2 additions & 2 deletions
@@ -1001,8 +1001,8 @@ struct FluxCLIPEmbedder : public Conditioner {
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
-        clip_l->get_param_tensors(tensors, "text_encoders.clip_l.text_model");
-        t5->get_param_tensors(tensors, "text_encoders.t5xxl");
+        clip_l->get_param_tensors(tensors, "text_encoders.clip_l.transformer.text_model");
+        t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
     }
 
     void alloc_params_buffer() {

denoiser.hpp

Lines changed: 36 additions & 39 deletions
@@ -53,7 +53,7 @@ struct ExponentialSchedule : SigmaSchedule {
         // Calculate step size
         float log_sigma_min = std::log(sigma_min);
         float log_sigma_max = std::log(sigma_max);
-        float step = (log_sigma_max - log_sigma_min) / (n - 1);
+        float step          = (log_sigma_max - log_sigma_min) / (n - 1);
 
         // Fill sigmas with exponential values
         for (uint32_t i = 0; i < n; ++i) {
@@ -209,7 +209,7 @@ struct AYSSchedule : SigmaSchedule {
 
 /*
 * GITS Scheduler: https://github.com/zju-pi/diff-sampler/tree/main/gits-main
- */
+*/
 struct GITSSchedule : SigmaSchedule {
     std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) {
         if (sigma_max <= 0.0f) {
@@ -225,7 +225,7 @@ struct GITSSchedule : SigmaSchedule {
         // Calculate the index based on the coefficient
         int index = static_cast<int>((coeff - 0.80f) / 0.05f);
         // Ensure the index is within bounds
-        index = std::max(0, std::min(index, static_cast<int>(GITS_NOISE.size() - 1)));
+        index                                                  = std::max(0, std::min(index, static_cast<int>(GITS_NOISE.size() - 1)));
         const std::vector<std::vector<float>>& selected_noise = *GITS_NOISE[index];
 
         if (n <= 20) {
@@ -841,7 +841,7 @@ static void sample_k_diffusion(sample_method_t method,
         } break;
         case IPNDM:  // iPNDM sampler from https://github.com/zju-pi/diff-sampler/tree/main/diff-solvers-main
         {
-            int max_order = 4;
+            int max_order       = 4;
             ggml_tensor* x_next = x;
             std::vector<ggml_tensor*> buffer_model;
 
@@ -852,15 +852,15 @@ static void sample_k_diffusion(sample_method_t method,
                 float sigma_next = sigmas[i + 1];
 
                 ggml_tensor* x_cur = x_next;
-                float* vec_x_cur = (float*)x_cur->data;
-                float* vec_x_next = (float*)x_next->data;
+                float* vec_x_cur   = (float*)x_cur->data;
+                float* vec_x_next  = (float*)x_next->data;
 
                 // Denoising step
                 ggml_tensor* denoised = model(x_cur, sigma, i + 1);
-                float* vec_denoised = (float*)denoised->data;
+                float* vec_denoised   = (float*)denoised->data;
                 // d_cur = (x_cur - denoised) / sigma
                 struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x_cur);
-                float* vec_d_cur = (float*)d_cur->data;
+                float* vec_d_cur          = (float*)d_cur->data;
 
                 for (int j = 0; j < ggml_nelements(d_cur); j++) {
                     vec_d_cur[j] = (vec_x_cur[j] - vec_denoised[j]) / sigma;
@@ -877,34 +877,31 @@ static void sample_k_diffusion(sample_method_t method,
                         break;
 
                     case 2:  // Use one history point
-                        {
-                            float* vec_d_prev1 = (float*)buffer_model.back()->data;
-                            for (int j = 0; j < ggml_nelements(x_next); j++) {
-                                vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (3 * vec_d_cur[j] - vec_d_prev1[j]) / 2;
-                            }
+                    {
+                        float* vec_d_prev1 = (float*)buffer_model.back()->data;
+                        for (int j = 0; j < ggml_nelements(x_next); j++) {
+                            vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (3 * vec_d_cur[j] - vec_d_prev1[j]) / 2;
                         }
-                        break;
+                    } break;
 
                     case 3:  // Use two history points
-                        {
-                            float* vec_d_prev1 = (float*)buffer_model.back()->data;
-                            float* vec_d_prev2 = (float*)buffer_model[buffer_model.size() - 2]->data;
-                            for (int j = 0; j < ggml_nelements(x_next); j++) {
-                                vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (23 * vec_d_cur[j] - 16 * vec_d_prev1[j] + 5 * vec_d_prev2[j]) / 12;
-                            }
+                    {
+                        float* vec_d_prev1 = (float*)buffer_model.back()->data;
+                        float* vec_d_prev2 = (float*)buffer_model[buffer_model.size() - 2]->data;
+                        for (int j = 0; j < ggml_nelements(x_next); j++) {
+                            vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (23 * vec_d_cur[j] - 16 * vec_d_prev1[j] + 5 * vec_d_prev2[j]) / 12;
                         }
-                        break;
+                    } break;
 
                     case 4:  // Use three history points
-                        {
-                            float* vec_d_prev1 = (float*)buffer_model.back()->data;
-                            float* vec_d_prev2 = (float*)buffer_model[buffer_model.size() - 2]->data;
-                            float* vec_d_prev3 = (float*)buffer_model[buffer_model.size() - 3]->data;
-                            for (int j = 0; j < ggml_nelements(x_next); j++) {
-                                vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (55 * vec_d_cur[j] - 59 * vec_d_prev1[j] + 37 * vec_d_prev2[j] - 9 * vec_d_prev3[j]) / 24;
-                            }
+                    {
+                        float* vec_d_prev1 = (float*)buffer_model.back()->data;
+                        float* vec_d_prev2 = (float*)buffer_model[buffer_model.size() - 2]->data;
+                        float* vec_d_prev3 = (float*)buffer_model[buffer_model.size() - 3]->data;
+                        for (int j = 0; j < ggml_nelements(x_next); j++) {
+                            vec_x_next[j] = vec_x_cur[j] + (sigma_next - sigma) * (55 * vec_d_cur[j] - 59 * vec_d_prev1[j] + 37 * vec_d_prev2[j] - 9 * vec_d_prev3[j]) / 24;
                         }
-                        break;
+                    } break;
                 }
 
                 // Manage buffer_model
@@ -932,23 +929,23 @@ static void sample_k_diffusion(sample_method_t method,
                 float t_next = sigmas[i + 1];
 
                 // Denoising step
-                ggml_tensor* denoised = model(x, sigma, i + 1);
-                float* vec_denoised = (float*)denoised->data;
+                ggml_tensor* denoised     = model(x, sigma, i + 1);
+                float* vec_denoised       = (float*)denoised->data;
                 struct ggml_tensor* d_cur = ggml_dup_tensor(work_ctx, x);
-                float* vec_d_cur = (float*)d_cur->data;
-                float* vec_x = (float*)x->data;
+                float* vec_d_cur          = (float*)d_cur->data;
+                float* vec_x              = (float*)x->data;
 
                 // d_cur = (x - denoised) / sigma
                 for (int j = 0; j < ggml_nelements(d_cur); j++) {
                     vec_d_cur[j] = (vec_x[j] - vec_denoised[j]) / sigma;
                 }
 
-                int order = std::min(max_order, i + 1);
-                float h_n = t_next - sigma;
+                int order   = std::min(max_order, i + 1);
+                float h_n   = t_next - sigma;
                 float h_n_1 = (i > 0) ? (sigma - sigmas[i - 1]) : h_n;
 
                 switch (order) {
-                    case 1: // First Euler step
+                    case 1:  // First Euler step
                         for (int j = 0; j < ggml_nelements(x_next); j++) {
                             vec_x[j] += vec_d_cur[j] * h_n;
                         }
@@ -963,7 +960,7 @@ static void sample_k_diffusion(sample_method_t method,
                     }
 
                     case 3: {
-                        float h_n_2 = (i > 1) ? (sigmas[i - 1] - sigmas[i - 2]) : h_n_1;
+                        float h_n_2        = (i > 1) ? (sigmas[i - 1] - sigmas[i - 2]) : h_n_1;
                         float* vec_d_prev1 = (float*)buffer_model.back()->data;
                         float* vec_d_prev2 = (buffer_model.size() > 1) ? (float*)buffer_model[buffer_model.size() - 2]->data : vec_d_prev1;
                         for (int j = 0; j < ggml_nelements(x_next); j++) {
@@ -973,8 +970,8 @@ static void sample_k_diffusion(sample_method_t method,
                     }
 
                     case 4: {
-                        float h_n_2 = (i > 1) ? (sigmas[i - 1] - sigmas[i - 2]) : h_n_1;
-                        float h_n_3 = (i > 2) ? (sigmas[i - 2] - sigmas[i - 3]) : h_n_2;
+                        float h_n_2        = (i > 1) ? (sigmas[i - 1] - sigmas[i - 2]) : h_n_1;
+                        float h_n_3        = (i > 2) ? (sigmas[i - 2] - sigmas[i - 3]) : h_n_2;
                         float* vec_d_prev1 = (float*)buffer_model.back()->data;
                         float* vec_d_prev2 = (buffer_model.size() > 1) ? (float*)buffer_model[buffer_model.size() - 2]->data : vec_d_prev1;
                         float* vec_d_prev3 = (buffer_model.size() > 2) ? (float*)buffer_model[buffer_model.size() - 3]->data : vec_d_prev2;
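Aside from the brace placement and assignment alignment being normalized above, the iPNDM math is unchanged: the 3/2, 23/12, and 55/24 factors visible in the hunks are the classic Adams-Bashforth multistep weights applied to the per-step derivative d = (x - denoised) / sigma. A sketch of the update in my own notation (not taken verbatim from the diff):

```
% Sketch of the iPNDM multistep update (notation mine); d_k is what the code
% stores in d_cur / buffer_model, and (sigma_{i+1} - sigma_i) is (sigma_next - sigma).
d_k = \frac{x_k - \mathrm{denoised}(x_k, \sigma_k)}{\sigma_k}
\qquad
x_{i+1} = x_i + (\sigma_{i+1} - \sigma_i)\,
\begin{cases}
d_i & \text{no history} \\
\tfrac{1}{2}\,(3\,d_i - d_{i-1}) & \text{one history point} \\
\tfrac{1}{12}\,(23\,d_i - 16\,d_{i-1} + 5\,d_{i-2}) & \text{two history points} \\
\tfrac{1}{24}\,(55\,d_i - 59\,d_{i-1} + 37\,d_{i-2} - 9\,d_{i-3}) & \text{three history points}
\end{cases}
```

At most three previous derivatives are kept in buffer_model, which is why the code caps the scheme at max_order = 4.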

docs/sd3.md

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+# How to Use
+
+## Download weights
+
+- Download sd3.5_large from https://huggingface.co/stabilityai/stable-diffusion-3.5-large/blob/main/sd3.5_large.safetensors
+- Download clip_g from https://huggingface.co/Comfy-Org/stable-diffusion-3.5-fp8/blob/main/text_encoders/clip_g.safetensors
+- Download clip_l from https://huggingface.co/Comfy-Org/stable-diffusion-3.5-fp8/blob/main/text_encoders/clip_l.safetensors
+- Download t5xxl from https://huggingface.co/Comfy-Org/stable-diffusion-3.5-fp8/blob/main/text_encoders/t5xxl_fp16.safetensors
+
+
+## Run
+
+### SD3.5 Large
+For example:
+
+```
+.\bin\Release\sd.exe -m ..\models\sd3.5_large.safetensors --clip_l ..\models\clip_l.safetensors --clip_g ..\models\clip_g.safetensors --t5xxl ..\models\t5xxl_fp16.safetensors -H 1024 -W 1024 -p 'a lovely cat holding a sign says \"Stable diffusion 3.5 Large\"' --cfg-scale 4.5 --sampling-method euler -v
+```
+
+![](../assets/sd3.5_large.png)
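The command in the new doc is a Windows invocation. As a sketch only, assuming a CMake build that places the binary at ./build/bin/sd and the weights under ./models (neither path comes from this diff), the same flags on Linux/macOS would look like:

```
./build/bin/sd -m ./models/sd3.5_large.safetensors \
    --clip_l ./models/clip_l.safetensors \
    --clip_g ./models/clip_g.safetensors \
    --t5xxl ./models/t5xxl_fp16.safetensors \
    -H 1024 -W 1024 \
    -p 'a lovely cat holding a sign says "Stable diffusion 3.5 Large"' \
    --cfg-scale 4.5 --sampling-method euler -v
```

With a POSIX shell the inner double quotes do not need the backslash escapes used in the Windows example.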

examples/cli/main.cpp

Lines changed: 19 additions & 10 deletions
@@ -69,9 +69,9 @@ enum SDMode {
 struct SDParams {
     int n_threads = -1;
     SDMode mode   = TXT2IMG;
-
     std::string model_path;
     std::string clip_l_path;
+    std::string clip_g_path;
     std::string t5xxl_path;
     std::string diffusion_model_path;
     std::string vae_path;
@@ -128,6 +128,7 @@ void print_params(SDParams params) {
     printf(" model_path: %s\n", params.model_path.c_str());
     printf(" wtype: %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified");
     printf(" clip_l_path: %s\n", params.clip_l_path.c_str());
+    printf(" clip_g_path: %s\n", params.clip_g_path.c_str());
     printf(" t5xxl_path: %s\n", params.t5xxl_path.c_str());
     printf(" diffusion_model_path: %s\n", params.diffusion_model_path.c_str());
     printf(" vae_path: %s\n", params.vae_path.c_str());
@@ -171,23 +172,24 @@ void print_usage(int argc, const char* argv[]) {
     printf("arguments:\n");
     printf(" -h, --help show this help message and exit\n");
     printf(" -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)\n");
-    printf(" -t, --threads N number of threads to use during computation (default: -1).\n");
+    printf(" -t, --threads N number of threads to use during computation (default: -1)\n");
     printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n");
     printf(" -m, --model [MODEL] path to full model\n");
     printf(" --diffusion-model path to the standalone diffusion model\n");
     printf(" --clip_l path to the clip-l text encoder\n");
-    printf(" --t5xxl path to the the t5xxl text encoder.\n");
+    printf(" --clip_g path to the clip-l text encoder\n");
+    printf(" --t5xxl path to the the t5xxl text encoder\n");
     printf(" --vae [VAE] path to vae\n");
     printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n");
     printf(" --control-net [CONTROL_PATH] path to control net model\n");
-    printf(" --embd-dir [EMBEDDING_PATH] path to embeddings.\n");
-    printf(" --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings.\n");
-    printf(" --input-id-images-dir [DIR] path to PHOTOMAKER input id images dir.\n");
+    printf(" --embd-dir [EMBEDDING_PATH] path to embeddings\n");
+    printf(" --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings\n");
+    printf(" --input-id-images-dir [DIR] path to PHOTOMAKER input id images dir\n");
     printf(" --normalize-input normalize PHOTOMAKER input id images\n");
-    printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.\n");
+    printf(" --upscale-model [ESRGAN_PATH] path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now\n");
     printf(" --upscale-repeats Run the ESRGAN upscaler this many times (default 1)\n");
     printf(" --type [TYPE] weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0, q2_k, q3_k, q4_k)\n");
-    printf(" If not specified, the default is the type of the weight file.\n");
+    printf(" If not specified, the default is the type of the weight file\n");
     printf(" --lora-model-dir [DIR] lora model directory\n");
     printf(" -i, --init-img [IMAGE] path to the input image, required by img2img\n");
     printf(" --control-image [IMAGE] path to image condition, control net\n");
@@ -206,13 +208,13 @@ void print_usage(int argc, const char* argv[]) {
     printf(" --steps STEPS number of sample steps (default: 20)\n");
     printf(" --rng {std_default, cuda} RNG (default: cuda)\n");
     printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n");
-    printf(" -b, --batch-count COUNT number of images to generate.\n");
+    printf(" -b, --batch-count COUNT number of images to generate\n");
     printf(" --schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete)\n");
     printf(" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n");
     printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n");
     printf(" --vae-tiling process vae in tiles to reduce memory usage\n");
     printf(" --vae-on-cpu keep vae in cpu (for low vram)\n");
-    printf(" --clip-on-cpu keep clip in cpu (for low vram).\n");
+    printf(" --clip-on-cpu keep clip in cpu (for low vram)\n");
     printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n");
     printf(" --canny apply canny preprocessor (edge detection)\n");
     printf(" --color Colors the logging tags according to level\n");
@@ -262,6 +264,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.clip_l_path = argv[i];
+        } else if (arg == "--clip_g") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.clip_g_path = argv[i];
         } else if (arg == "--t5xxl") {
             if (++i >= argc) {
                 invalid_arg = true;
@@ -802,6 +810,7 @@ int main(int argc, char* argv[]) {
 
     sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(),
                                   params.clip_l_path.c_str(),
+                                  params.clip_g_path.c_str(),
                                   params.t5xxl_path.c_str(),
                                   params.diffusion_model_path.c_str(),
                                   params.vae_path.c_str(),

ggml_extend.hpp

Lines changed: 4 additions & 5 deletions
@@ -368,8 +368,8 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
     int64_t height   = input->ne[1];
     int64_t channels = input->ne[2];
 
-    int64_t img_width = output->ne[0];
-    int64_t img_height = output->ne[1];
+    int64_t img_width  = output->ne[0];
+    int64_t img_height = output->ne[1];
 
     GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
     for (int iy = 0; iy < height; iy++) {
@@ -380,7 +380,7 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
                     float old_value = ggml_tensor_get_f32(output, x + ix, y + iy, k);
 
                     const float x_f_0 = (x > 0) ? ix / float(overlap) : 1;
-                    const float x_f_1 = (x < (img_width - width)) ? (width - ix) / float(overlap) : 1 ;
+                    const float x_f_1 = (x < (img_width - width)) ? (width - ix) / float(overlap) : 1;
                     const float y_f_0 = (y > 0) ? iy / float(overlap) : 1;
                     const float y_f_1 = (y < (img_height - height)) ? (height - iy) / float(overlap) : 1;
 
@@ -390,8 +390,7 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
                     ggml_tensor_set_f32(
                         output,
                         old_value + new_value * ggml_smootherstep_f32(y_f) * ggml_smootherstep_f32(x_f),
-                        x + ix, y + iy, k
-                    );
+                        x + ix, y + iy, k);
                 } else {
                     ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k);
                 }
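For context on the lines reformatted above: in the overlap band, ggml_merge_tensor_2d accumulates each incoming tile pixel into the output with a weight shaped by smootherstep ramps over the overlap width. A sketch in my own notation, assuming ggml_smootherstep_f32 is the standard smootherstep polynomial and that x_f and y_f combine the per-edge ramps (that combination sits outside the hunks shown):

```
% Blend-weight sketch for ggml_merge_tensor_2d (notation mine, not from the diff)
S(t) = 6t^5 - 15t^4 + 10t^3
x_{f,0} = \begin{cases} ix / \mathrm{overlap} & \text{if } x > 0 \\ 1 & \text{otherwise} \end{cases}
\qquad
x_{f,1} = \begin{cases} (\mathrm{width} - ix) / \mathrm{overlap} & \text{if } x < \mathrm{img\_width} - \mathrm{width} \\ 1 & \text{otherwise} \end{cases}
\mathrm{output}(x + ix,\ y + iy,\ k) \mathrel{+}= \mathrm{new\_value} \cdot S(y_f) \cdot S(x_f)
```

The vertical ramps y_{f,0} and y_{f,1} are built the same way from iy, height, and img_height.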
