leejet · leejet · Jul 12, 2025 · Jul 7, 2025 · Jul 7, 2025 · Jul 7, 2025
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -96,15 +96,16 @@ struct SDParams {
 
     std::string prompt;
     std::string negative_prompt;
-    float min_cfg     = 1.0f;
-    float cfg_scale   = 7.0f;
-    float guidance    = 3.5f;
-    float eta         = 0.f;
-    float style_ratio = 20.f;
-    int clip_skip     = -1;  // <= 0 represents unspecified
-    int width         = 512;
-    int height        = 512;
-    int batch_count   = 1;
+    float min_cfg       = 1.0f;
+    float cfg_scale     = 7.0f;
+    float img_cfg_scale = INFINITY;
+    float guidance      = 3.5f;
+    float eta           = 0.f;
+    float style_ratio   = 20.f;
+    int clip_skip       = -1;  // <= 0 represents unspecified
+    int width           = 512;
+    int height          = 512;
+    int batch_count     = 1;
 
     int video_frames         = 6;
     int motion_bucket_id     = 127;
@@ -175,6 +176,7 @@ void print_params(SDParams params) {
     printf("    negative_prompt:   %s\n", params.negative_prompt.c_str());
     printf("    min_cfg:           %.2f\n", params.min_cfg);
     printf("    cfg_scale:         %.2f\n", params.cfg_scale);
+    printf("    img_cfg_scale:     %.2f\n", params.img_cfg_scale);
     printf("    slg_scale:         %.2f\n", params.slg_scale);
     printf("    guidance:          %.2f\n", params.guidance);
     printf("    eta:               %.2f\n", params.eta);
@@ -232,7 +234,8 @@ void print_usage(int argc, const char* argv[]) {
     printf("  -p, --prompt [PROMPT]              the prompt to render\n");
     printf("  -n, --negative-prompt PROMPT       the negative prompt (default: \"\")\n");
     printf("  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)\n");
-    printf("  --guidance SCALE                   guidance scale for img2img (default: 3.5)\n");
+    printf("  --img-cfg-scale SCALE              image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n");
+    printf("  --guidance SCALE                   distilled guidance scale for models with guidance input (default: 3.5)\n");
     printf("  --slg-scale SCALE                  skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n");
     printf("                                     0 means disabled, a value of 2.5 is nice for sd3.5 medium\n");
     printf("  --eta SCALE                        eta in DDIM, only for DDIM and TCD: (default: 0)\n");
@@ -462,6 +465,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.cfg_scale = std::stof(argv[i]);
+        } else if (arg == "--img-cfg-scale") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.img_cfg_scale = std::stof(argv[i]);
         } else if (arg == "--guidance") {
             if (++i >= argc) {
                 invalid_arg = true;
@@ -743,6 +752,10 @@ void parse_args(int argc, const char** argv, SDParams& params) {
             params.output_path = "output.gguf";
         }
     }
+
+    if (!isfinite(params.img_cfg_scale)) {
+        params.img_cfg_scale = params.cfg_scale;
+    }
 }
 
 static std::string sd_basename(const std::string& path) {
@@ -837,6 +850,18 @@ int main(int argc, const char* argv[]) {
 
     parse_args(argc, argv, params);
 
+    sd_guidance_params_t guidance_params = {params.cfg_scale,
+                                            params.img_cfg_scale,
+                                            params.min_cfg,
+                                            params.guidance,
+                                            {
+                                                params.skip_layers.data(),
+                                                params.skip_layers.size(),
+                                                params.skip_layer_start,
+                                                params.skip_layer_end,
+                                                params.slg_scale,
+                                            }};
+
     sd_set_log_callback(sd_log_cb, (void*)&params);
 
     if (params.verbose) {
@@ -1029,8 +1054,7 @@ int main(int argc, const char* argv[]) {
                           params.prompt.c_str(),
                           params.negative_prompt.c_str(),
                           params.clip_skip,
-                          params.cfg_scale,
-                          params.guidance,
+                          guidance_params,
                           params.eta,
                           params.width,
                           params.height,
@@ -1042,12 +1066,7 @@ int main(int argc, const char* argv[]) {
                           params.control_strength,
                           params.style_ratio,
                           params.normalize_input,
-                          params.input_id_images_path.c_str(),
-                          params.skip_layers.data(),
-                          params.skip_layers.size(),
-                          params.slg_scale,
-                          params.skip_layer_start,
-                          params.skip_layer_end);
+                          params.input_id_images_path.c_str());
     } else if (params.mode == IMG2IMG || params.mode == IMG2VID) {
         sd_image_t input_image = {(uint32_t)params.width,
                                   (uint32_t)params.height,
@@ -1063,8 +1082,7 @@ int main(int argc, const char* argv[]) {
                               params.motion_bucket_id,
                               params.fps,
                               params.augmentation_level,
-                              params.min_cfg,
-                              params.cfg_scale,
+                              guidance_params,
                               params.sample_method,
                               params.sample_steps,
                               params.strength,
@@ -1097,8 +1115,7 @@ int main(int argc, const char* argv[]) {
                               params.prompt.c_str(),
                               params.negative_prompt.c_str(),
                               params.clip_skip,
-                              params.cfg_scale,
-                              params.guidance,
+                              guidance_params,
                               params.eta,
                               params.width,
                               params.height,
@@ -1111,12 +1128,7 @@ int main(int argc, const char* argv[]) {
                               params.control_strength,
                               params.style_ratio,
                               params.normalize_input,
-                              params.input_id_images_path.c_str(),
-                              params.skip_layers.data(),
-                              params.skip_layers.size(),
-                              params.slg_scale,
-                              params.skip_layer_start,
-                              params.skip_layer_end);
+                              params.input_id_images_path.c_str());
         }
     } else {  // EDIT
         results = edit(sd_ctx,
@@ -1125,25 +1137,19 @@ int main(int argc, const char* argv[]) {
                        params.prompt.c_str(),
                        params.negative_prompt.c_str(),
                        params.clip_skip,
-                       params.cfg_scale,
-                       params.guidance,
+                       guidance_params,
                        params.eta,
                        params.width,
                        params.height,
                        params.sample_method,
                        params.sample_steps,
-                       params.strength,
                        params.seed,
                        params.batch_count,
                        control_image,
                        params.control_strength,
                        params.style_ratio,
                        params.normalize_input,
-                       params.skip_layers.data(),
-                       params.skip_layers.size(),
-                       params.slg_scale,
-                       params.skip_layer_start,
-                       params.skip_layer_end);
+                       params.input_id_images_path.c_str());
     }
 
     if (results == NULL) {

diff --git a/model.cpp b/model.cpp
@@ -100,7 +100,7 @@ const char* unused_tensors[] = {
     "model_ema.diffusion_model",
     "embedding_manager",
     "denoiser.sigmas",
-    "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight", // only used during training
+    "text_encoders.t5xxl.transformer.encoder.embed_tokens.weight",  // only used during training
 };
 
 bool is_unused_tensor(std::string name) {
@@ -1169,7 +1169,6 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
             n_dims = 1;
         }
 
-
         TensorStorage tensor_storage(prefix + name, type, ne, n_dims, file_index, ST_HEADER_SIZE_LEN + header_size_ + begin);
         tensor_storage.reverse_ne();
 
@@ -1674,10 +1673,14 @@ SDVersion ModelLoader::get_sd_version() {
         }
     }
     bool is_inpaint = input_block_weight.ne[2] == 9;
+    bool is_ip2p    = input_block_weight.ne[2] == 8;
     if (is_xl) {
         if (is_inpaint) {
             return VERSION_SDXL_INPAINT;
         }
+        if (is_ip2p) {
+            return VERSION_SDXL_PIX2PIX;
+        }
         return VERSION_SDXL;
     }
 
@@ -1693,6 +1696,9 @@ SDVersion ModelLoader::get_sd_version() {
         if (is_inpaint) {
             return VERSION_SD1_INPAINT;
         }
+        if (is_ip2p) {
+            return VERSION_SD1_PIX2PIX;
+        }
         return VERSION_SD1;
     } else if (token_embedding_weight.ne[0] == 1024) {
         if (is_inpaint) {
@@ -1914,7 +1920,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
         };
         int tensor_count = 0;
         int64_t t1       = ggml_time_ms();
-        bool partial = false;
+        bool partial     = false;
         for (auto& tensor_storage : processed_tensor_storages) {
             if (tensor_storage.file_index != file_index) {
                 ++tensor_count;
@@ -1997,9 +2003,9 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, ggml_backend
                 }
             }
             size_t tensor_max = processed_tensor_storages.size();
-            int64_t t2 = ggml_time_ms();
+            int64_t t2        = ggml_time_ms();
             pretty_progress(++tensor_count, tensor_max, (t2 - t1) / 1000.0f);
-            t1 = t2;
+            t1      = t2;
             partial = tensor_count != tensor_max;
         }
 

diff --git a/model.h b/model.h
@@ -21,10 +21,12 @@
 enum SDVersion {
     VERSION_SD1,
     VERSION_SD1_INPAINT,
+    VERSION_SD1_PIX2PIX,
     VERSION_SD2,
     VERSION_SD2_INPAINT,
     VERSION_SDXL,
     VERSION_SDXL_INPAINT,
+    VERSION_SDXL_PIX2PIX,
     VERSION_SVD,
     VERSION_SD3,
     VERSION_FLUX,
@@ -47,7 +49,7 @@ static inline bool sd_version_is_sd3(SDVersion version) {
 }
 
 static inline bool sd_version_is_sd1(SDVersion version) {
-    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) {
+    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) {
         return true;
     }
     return false;
@@ -61,7 +63,7 @@ static inline bool sd_version_is_sd2(SDVersion version) {
 }
 
 static inline bool sd_version_is_sdxl(SDVersion version) {
-    if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT) {
+    if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT || version == VERSION_SDXL_PIX2PIX) {
         return true;
     }
     return false;
@@ -81,6 +83,14 @@ static inline bool sd_version_is_dit(SDVersion version) {
     return false;
 }
 
+static inline bool sd_version_is_unet_edit(SDVersion version) {  // true only for the SD1 / SDXL *_PIX2PIX versions
+    return version == VERSION_SD1_PIX2PIX || version == VERSION_SDXL_PIX2PIX;
+}
+
+static inline bool sd_version_is_inpaint_or_unet_edit(SDVersion version) {  // true if either predicate below holds
+    return sd_version_is_unet_edit(version) || sd_version_is_inpaint(version);
+}
+
 enum PMVersion {
     PM_VERSION_1,
     PM_VERSION_2,