From a5dbce592790118582d8f41b2b710a153ae524fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 12 Feb 2025 00:15:59 +0100
Subject: [PATCH 1/8] apg: first implementation

---
 stable-diffusion.cpp | 60 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 2 deletions(-)
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index e38a6101..30754cf6 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -847,6 +847,15 @@ class StableDiffusionGGML {
         }
         struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
 
+        // TODO do not hardcode
+        float apg_eta           = .08f;
+        float apg_momentum      = -.5f;
+        float apg_norm_treshold = 15.0f;
+
+        std::vector<float> apg_momentum_buffer;
+        if (apg_momentum != 0)
+            apg_momentum_buffer.resize((size_t)ggml_nelements(denoised));
+
         auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
             if (step == 1) {
                 pretty_progress(0, (int)steps, 0);
@@ -951,6 +960,50 @@ class StableDiffusionGGML {
             float* vec_input     = (float*)input->data;
             float* positive_data = (float*)out_cond->data;
             int ne_elements      = (int)ggml_nelements(denoised);
+
+            float* deltas = vec_denoised;
+
+            // https://arxiv.org/pdf/2410.02416
+            float apg_scale_factor = 1.;
+            float diff_norm        = 0;
+            float cond_norm_sq     = 0;
+            float dot              = 0;
+            for (int i = 0; i < ne_elements; i++) {
+                float delta = positive_data[i] - negative_data[i];
+                if (apg_momentum != 0) {
+                    delta += apg_momentum * apg_momentum_buffer[i];
+                    apg_momentum_buffer[i] = delta;
+                }
+                if (apg_norm_treshold > 0) {
+                    diff_norm += delta * delta;
+                }
+                if (apg_eta != 1.0f) {
+                    cond_norm_sq += positive_data[i] * positive_data[i];
+                    dot += positive_data[i] * delta;
+                }
+                deltas[i] = delta;
+            }
+            if (apg_norm_treshold > 0) {
+                diff_norm        = std::sqrtf(diff_norm);
+                apg_scale_factor = std::min(1.0f, apg_norm_treshold / diff_norm);
+            }
+            if (apg_eta != 1.0f) {
+                dot *= apg_scale_factor;
+                // pre-normalize (avoids one square root and ne_elements extra divs)
+                dot /= cond_norm_sq;
+            }
+
+            for (int i = 0; i < ne_elements; i++) {
+                deltas[i] *= apg_scale_factor;
+                if (apg_eta != 1.0f) {
+                    float apg_parallel   = dot * positive_data[i];
+                    float apg_orthogonal = deltas[i] - apg_parallel;
+
+                    // tweak deltas
+                    deltas[i] = apg_orthogonal + apg_eta * apg_parallel;
+                }
+            }
+
             for (int i = 0; i < ne_elements; i++) {
                 float latent_result = positive_data[i];
                 if (has_unconditioned) {
@@ -960,7 +1013,9 @@ class StableDiffusionGGML {
                         int64_t i3  = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2];
                         float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3);
                     } else {
-                        latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
+                        float delta = deltas[i];
+
+                        latent_result = positive_data[i] + (cfg_scale - 1) * delta;
                     }
                 }
                 if (is_skiplayer_step) {
@@ -1004,7 +1059,8 @@ class StableDiffusionGGML {
     }
 
     // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding
-    ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) {
+    ggml_tensor*
+    get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) {
         // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample
         ggml_tensor* latent       = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]);
         struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent);

From 02114c2e9a60ee45f075509da28863b6feace192 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 12 Feb 2025 01:04:40 +0100
Subject: [PATCH 2/8] refactor guidance params in lib

---
 examples/cli/main.cpp | 32 +++++++++--------
 stable-diffusion.cpp  | 83 ++++++++++++++++---------------------------
 stable-diffusion.h    | 28 +++++++++------
 3 files changed, 66 insertions(+), 77 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index af6b2bbd..8a9114f0 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -963,11 +963,12 @@ int main(int argc, const char* argv[]) {
                           params.style_ratio,
                           params.normalize_input,
                           params.input_id_images_path.c_str(),
-                          params.skip_layers.data(),
-                          params.skip_layers.size(),
-                          params.slg_scale,
-                          params.skip_layer_start,
-                          params.skip_layer_end);
+                          sd_slg_params_t{params.skip_layers.data(),
+                                          params.skip_layers.size(),
+                                          params.slg_scale,
+                                          params.skip_layer_start,
+                                          params.skip_layer_end},
+                          sd_apg_params_t{1, 0, 0});
     } else {
         sd_image_t input_image = {(uint32_t)params.width,
                                   (uint32_t)params.height,
@@ -1032,11 +1033,12 @@ int main(int argc, const char* argv[]) {
                               params.style_ratio,
                               params.normalize_input,
                               params.input_id_images_path.c_str(),
-                              params.skip_layers.data(),
-                              params.skip_layers.size(),
-                              params.slg_scale,
-                              params.skip_layer_start,
-                              params.skip_layer_end);
+                              sd_slg_params_t{params.skip_layers.data(),
+                                              params.skip_layers.size(),
+                                              params.slg_scale,
+                                              params.skip_layer_start,
+                                              params.skip_layer_end},
+                              sd_apg_params_t{1, 0, 0});
         }
     }
 
@@ -1075,11 +1077,11 @@ int main(int argc, const char* argv[]) {
 
     std::string dummy_name, ext, lc_ext;
     bool is_jpg;
-    size_t last = params.output_path.find_last_of(".");
+    size_t last      = params.output_path.find_last_of(".");
     size_t last_path = std::min(params.output_path.find_last_of("/"),
                                 params.output_path.find_last_of("\\"));
-    if (last != std::string::npos // filename has extension
-    && (last_path == std::string::npos || last > last_path)) {
+    if (last != std::string::npos  // filename has extension
+        && (last_path == std::string::npos || last > last_path)) {
         dummy_name = params.output_path.substr(0, last);
         ext = lc_ext = params.output_path.substr(last);
         std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower);
@@ -1087,7 +1089,7 @@ int main(int argc, const char* argv[]) {
     } else {
         dummy_name = params.output_path;
         ext = lc_ext = "";
-        is_jpg = false;
+        is_jpg       = false;
     }
     // appending ".png" to absent or unknown extension
     if (!is_jpg && lc_ext != ".png") {
@@ -1099,7 +1101,7 @@ int main(int argc, const char* argv[]) {
             continue;
         }
         std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext;
-        if(is_jpg) {
+        if (is_jpg) {
             stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel,
                            results[i].data, 90, get_image_params(params, params.seed + i).c_str());
             printf("save result JPEG image to '%s'\n", final_image_path.c_str());
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 30754cf6..bc5bdec0 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -800,11 +800,11 @@ class StableDiffusionGGML {
                         const std::vector<float>& sigmas,
                         int start_merge_step,
                         SDCondition id_cond,
-                        std::vector<int> skip_layers = {},
-                        float slg_scale              = 0,
-                        float skip_layer_start       = 0.01,
-                        float skip_layer_end         = 0.2,
-                        ggml_tensor* noise_mask      = nullptr) {
+                        sd_slg_params_t slg_params = {NULL, 0, 0, 0, 0},
+                        sd_apg_params_t apg_params = {1, 0, 0},
+                        ggml_tensor* noise_mask  = nullptr) {
+        std::vector<int> skip_layers(slg_params.skip_layers, slg_params.skip_layers + slg_params.skip_layers_count);
+
         LOG_DEBUG("Sample");
         struct ggml_init_params params;
         size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
@@ -827,7 +827,7 @@ class StableDiffusionGGML {
         struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise);
 
         bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL;
-        bool has_skiplayer     = slg_scale != 0.0 && skip_layers.size() > 0;
+        bool has_skiplayer     = slg_params.scale != 0.0 && skip_layers.size() > 0;
 
         // denoise wrapper
         struct ggml_tensor* out_cond   = ggml_dup_tensor(work_ctx, x);
@@ -847,13 +847,8 @@ class StableDiffusionGGML {
         }
         struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x);
 
-        // TODO do not hardcode
-        float apg_eta           = .08f;
-        float apg_momentum      = -.5f;
-        float apg_norm_treshold = 15.0f;
-
         std::vector<float> apg_momentum_buffer;
-        if (apg_momentum != 0)
+        if (apg_params.momentum != 0)
             apg_momentum_buffer.resize((size_t)ggml_nelements(denoised));
 
         auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* {
@@ -936,7 +931,7 @@ class StableDiffusionGGML {
             }
 
             int step_count         = sigmas.size();
-            bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count);
+            bool is_skiplayer_step = has_skiplayer && step > (int)(slg_params.skip_layer_start * step_count) && step < (int)(slg_params.skip_layer_end * step_count);
             float* skip_layer_data = NULL;
             if (is_skiplayer_step) {
                 LOG_DEBUG("Skipping layers at step %d\n", step);
@@ -970,24 +965,24 @@ class StableDiffusionGGML {
             float dot              = 0;
             for (int i = 0; i < ne_elements; i++) {
                 float delta = positive_data[i] - negative_data[i];
-                if (apg_momentum != 0) {
-                    delta += apg_momentum * apg_momentum_buffer[i];
+                if (apg_params.momentum != 0) {
+                    delta += apg_params.momentum * apg_momentum_buffer[i];
                     apg_momentum_buffer[i] = delta;
                 }
-                if (apg_norm_treshold > 0) {
+                if (apg_params.norm_treshold > 0) {
                     diff_norm += delta * delta;
                 }
-                if (apg_eta != 1.0f) {
+                if (apg_params.eta != 1.0f) {
                     cond_norm_sq += positive_data[i] * positive_data[i];
                     dot += positive_data[i] * delta;
                 }
                 deltas[i] = delta;
             }
-            if (apg_norm_treshold > 0) {
+            if (apg_params.norm_treshold > 0) {
                 diff_norm        = std::sqrtf(diff_norm);
-                apg_scale_factor = std::min(1.0f, apg_norm_treshold / diff_norm);
+                apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm);
             }
-            if (apg_eta != 1.0f) {
+            if (apg_params.eta != 1.0f) {
                 dot *= apg_scale_factor;
                 // pre-normalize (avoids one square root and ne_elements extra divs)
                 dot /= cond_norm_sq;
@@ -995,12 +990,12 @@ class StableDiffusionGGML {
 
             for (int i = 0; i < ne_elements; i++) {
                 deltas[i] *= apg_scale_factor;
-                if (apg_eta != 1.0f) {
+                if (apg_params.eta != 1.0f) {
                     float apg_parallel   = dot * positive_data[i];
                     float apg_orthogonal = deltas[i] - apg_parallel;
 
                     // tweak deltas
-                    deltas[i] = apg_orthogonal + apg_eta * apg_parallel;
+                    deltas[i] = apg_orthogonal + apg_params.eta * apg_parallel;
                 }
             }
 
@@ -1019,7 +1014,7 @@ class StableDiffusionGGML {
                     }
                 }
                 if (is_skiplayer_step) {
-                    latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale;
+                    latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_params.scale;
                 }
                 // v = latent_result, eps = latent_result
                 // denoised = (v * c_out + input * c_skip) or (input + eps * c_out)
@@ -1265,11 +1260,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                            float style_ratio,
                            bool normalize_input,
                            std::string input_id_images_path,
-                           std::vector<int> skip_layers = {},
-                           float slg_scale              = 0,
-                           float skip_layer_start       = 0.01,
-                           float skip_layer_end         = 0.2,
-                           ggml_tensor* masked_image    = NULL) {
+                           sd_slg_params_t slg_params,
+                           sd_apg_params_t apg_params,
+                           ggml_tensor* masked_image = NULL) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1522,10 +1515,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                                      sigmas,
                                                      start_merge_step,
                                                      id_cond,
-                                                     skip_layers,
-                                                     slg_scale,
-                                                     skip_layer_start,
-                                                     skip_layer_end,
+                                                     slg_params,
+                                                     apg_params,
                                                      noise_mask);
 
         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
@@ -1595,12 +1586,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                     float style_ratio,
                     bool normalize_input,
                     const char* input_id_images_path_c_str,
-                    int* skip_layers         = NULL,
-                    size_t skip_layers_count = 0,
-                    float slg_scale          = 0,
-                    float skip_layer_start   = 0.01,
-                    float skip_layer_end     = 0.2) {
-    std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
+                    sd_slg_params_t slg_params,
+                    sd_apg_params_t apg_params) {
     LOG_DEBUG("txt2img %dx%d", width, height);
     if (sd_ctx == NULL) {
         return NULL;
@@ -1674,10 +1661,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                                                style_ratio,
                                                normalize_input,
                                                input_id_images_path_c_str,
-                                               skip_layers_vec,
-                                               slg_scale,
-                                               skip_layer_start,
-                                               skip_layer_end);
+                                               slg_params,
+                                               apg_params);
 
     size_t t1 = ggml_time_ms();
 
@@ -1707,12 +1692,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                     float style_ratio,
                     bool normalize_input,
                     const char* input_id_images_path_c_str,
-                    int* skip_layers         = NULL,
-                    size_t skip_layers_count = 0,
-                    float slg_scale          = 0,
-                    float skip_layer_start   = 0.01,
-                    float skip_layer_end     = 0.2) {
-    std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
+                    sd_slg_params_t slg_params,
+                    sd_apg_params_t apg_params) {
     LOG_DEBUG("img2img %dx%d", width, height);
     if (sd_ctx == NULL) {
         return NULL;
@@ -1854,10 +1835,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                                                style_ratio,
                                                normalize_input,
                                                input_id_images_path_c_str,
-                                               skip_layers_vec,
-                                               slg_scale,
-                                               skip_layer_start,
-                                               skip_layer_end,
+                                               slg_params,
+                                               apg_params,
                                                masked_image);
 
     size_t t2 = ggml_time_ms();
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 52dcc848..e367d7dd 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -127,6 +127,20 @@ typedef struct {
     uint8_t* data;
 } sd_image_t;
 
+typedef struct {
+    float eta;
+    float momentum;
+    float norm_treshold;
+} sd_apg_params_t;
+
+typedef struct {
+    int* skip_layers;
+    size_t skip_layers_count;
+    float scale;
+    float skip_layer_start;
+    float skip_layer_end;
+} sd_slg_params_t;
+
 typedef struct sd_ctx_t sd_ctx_t;
 
 SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
@@ -172,11 +186,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                            float style_strength,
                            bool normalize_input,
                            const char* input_id_images_path,
-                           int* skip_layers,
-                           size_t skip_layers_count,
-                           float slg_scale,
-                           float skip_layer_start,
-                           float skip_layer_end);
+                           sd_slg_params_t slg_params,
+                           sd_apg_params_t apg_params);
 
 SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
                            sd_image_t init_image,
@@ -199,11 +210,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
                            float style_strength,
                            bool normalize_input,
                            const char* input_id_images_path,
-                           int* skip_layers,
-                           size_t skip_layers_count,
-                           float slg_scale,
-                           float skip_layer_start,
-                           float skip_layer_end);
+                           sd_slg_params_t slg_params,
+                           sd_apg_params_t apg_params);
 
 SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
                            sd_image_t init_image,

From 98e056b6e2054760622513518b6c47fc56fc9c59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 12 Feb 2025 01:38:05 +0100
Subject: [PATCH 3/8] main: add apg support

---
 examples/cli/main.cpp | 35 ++++++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 8a9114f0..4edf5bc7 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -126,9 +126,13 @@ struct SDParams {
     int upscale_repeats           = 1;
 
     std::vector<int> skip_layers = {7, 8, 9};
-    float slg_scale              = 0.f;
+    float slg_scale              = 0.0f;
     float skip_layer_start       = 0.01f;
     float skip_layer_end         = 0.2f;
+
+    float apg_eta           = 1.0f;
+    float apg_momentum      = 0.0f;
+    float apg_norm_treshold = 0.0f;
 };
 
 void print_params(SDParams params) {
@@ -213,6 +217,9 @@ void print_usage(int argc, const char* argv[]) {
     printf("  -n, --negative-prompt PROMPT       the negative prompt (default: \"\")\n");
     printf("  --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)\n");
     printf("  --guidance SCALE                   guidance scale for img2img (default: 3.5)\n");
+    printf("  --apg-eta VALUE                    parallel projected guidance scale for APG (default: 1.0, recommended: between 0 and 1)\n");
+    printf("  --apg-momentum VALUE               CFG update direction momentum for APG (default: 0, recommended: around -0.5)\n");
+    printf("  --apg-nt, --apg-rescale VALUE      CFG update direction norm threshold for APG (default: 0 = disabled, recommended: 4-15)\n");
     printf("  --slg-scale SCALE                  skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n");
     printf("                                     0 means disabled, a value of 2.5 is nice for sd3.5 medium\n");
     printf("  --eta SCALE                        eta in DDIM, only for DDIM and TCD: (default: 0)\n");
@@ -629,6 +636,24 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.skip_layer_end = std::stof(argv[i]);
+        } else if (arg == "--apg-eta") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.apg_eta = std::stof(argv[i]);
+        } else if (arg == "--apg-momentum") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.apg_momentum = std::stof(argv[i]);
+        } else if (arg == "--apg-nt" || arg == "--apg-rescale") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.apg_norm_treshold = std::stof(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             print_usage(argc, argv);
@@ -968,7 +993,9 @@ int main(int argc, const char* argv[]) {
                                           params.slg_scale,
                                           params.skip_layer_start,
                                           params.skip_layer_end},
-                          sd_apg_params_t{1, 0, 0});
+                          sd_apg_params_t{params.apg_eta,
+                                          params.apg_momentum,
+                                          params.apg_norm_treshold});
     } else {
         sd_image_t input_image = {(uint32_t)params.width,
                                   (uint32_t)params.height,
@@ -1038,7 +1065,9 @@ int main(int argc, const char* argv[]) {
                                               params.slg_scale,
                                               params.skip_layer_start,
                                               params.skip_layer_end},
-                              sd_apg_params_t{1, 0, 0});
+                              sd_apg_params_t{params.apg_eta,
+                                              params.apg_momentum,
+                                              params.apg_norm_treshold});
         }
     }
 

From e64b3b853a563774e7a0f4c5c7d0c2e63cab3a8e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 12 Feb 2025 02:17:45 +0100
Subject: [PATCH 4/8] add apg settings to image params

---
 examples/cli/main.cpp | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 4edf5bc7..f34c4e1d 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -130,9 +130,9 @@ struct SDParams {
     float skip_layer_start       = 0.01f;
     float skip_layer_end         = 0.2f;
 
-    float apg_eta           = 1.0f;
-    float apg_momentum      = 0.0f;
-    float apg_norm_treshold = 0.0f;
+    float apg_eta            = 1.0f;
+    float apg_momentum       = 0.0f;
+    float apg_norm_threshold = 0.0f;
 };
 
 void print_params(SDParams params) {
@@ -653,7 +653,7 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 invalid_arg = true;
                 break;
             }
-            params.apg_norm_treshold = std::stof(argv[i]);
+            params.apg_norm_threshold = std::stof(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             print_usage(argc, argv);
@@ -744,6 +744,15 @@ std::string get_image_params(SDParams params, int64_t seed) {
     }
     parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
     parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
+    if (params.apg_eta != 1) {
+        parameter_string += "APG eta: " + std::to_string(params.apg_eta) + ", ";
+    }
+    if (params.apg_momentum != 0) {
+        parameter_string += "CFG momentum: " + std::to_string(params.apg_momentum) + ", ";
+    }
+    if (params.apg_norm_threshold != 0) {
+        parameter_string += "CFG normalization threshold: " + std::to_string(params.apg_norm_threshold) + ", ";
+    }
     if (params.slg_scale != 0 && params.skip_layers.size() != 0) {
         parameter_string += "SLG scale: " + std::to_string(params.cfg_scale) + ", ";
         parameter_string += "Skip layers: [";
@@ -995,7 +1004,7 @@ int main(int argc, const char* argv[]) {
                                           params.skip_layer_end},
                           sd_apg_params_t{params.apg_eta,
                                           params.apg_momentum,
-                                          params.apg_norm_treshold});
+                                          params.apg_norm_threshold});
     } else {
         sd_image_t input_image = {(uint32_t)params.width,
                                   (uint32_t)params.height,
@@ -1067,7 +1076,7 @@ int main(int argc, const char* argv[]) {
                                               params.skip_layer_end},
                               sd_apg_params_t{params.apg_eta,
                                               params.apg_momentum,
-                                              params.apg_norm_treshold});
+                                              params.apg_norm_threshold});
         }
     }
 

From 98064d0f7bd759b46d0ce6b0efc32405e5a4f3c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 12 Feb 2025 03:08:58 +0100
Subject: [PATCH 5/8] Fix crash when cfg scale is 1

---
 stable-diffusion.cpp | 56 +++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 27 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index bc5bdec0..be8dc2c6 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -802,7 +802,7 @@ class StableDiffusionGGML {
                         SDCondition id_cond,
                         sd_slg_params_t slg_params = {NULL, 0, 0, 0, 0},
                         sd_apg_params_t apg_params = {1, 0, 0},
-                        ggml_tensor* noise_mask  = nullptr) {
+                        ggml_tensor* noise_mask    = nullptr) {
         std::vector<int> skip_layers(slg_params.skip_layers, slg_params.skip_layers + slg_params.skip_layers_count);
 
         LOG_DEBUG("Sample");
@@ -963,39 +963,41 @@ class StableDiffusionGGML {
             float diff_norm        = 0;
             float cond_norm_sq     = 0;
             float dot              = 0;
-            for (int i = 0; i < ne_elements; i++) {
-                float delta = positive_data[i] - negative_data[i];
-                if (apg_params.momentum != 0) {
-                    delta += apg_params.momentum * apg_momentum_buffer[i];
-                    apg_momentum_buffer[i] = delta;
+            if (has_unconditioned) {
+                for (int i = 0; i < ne_elements; i++) {
+                    float delta = positive_data[i] - negative_data[i];
+                    if (apg_params.momentum != 0) {
+                        delta += apg_params.momentum * apg_momentum_buffer[i];
+                        apg_momentum_buffer[i] = delta;
+                    }
+                    if (apg_params.norm_treshold > 0) {
+                        diff_norm += delta * delta;
+                    }
+                    if (apg_params.eta != 1.0f) {
+                        cond_norm_sq += positive_data[i] * positive_data[i];
+                        dot += positive_data[i] * delta;
+                    }
+                    deltas[i] = delta;
                 }
                 if (apg_params.norm_treshold > 0) {
-                    diff_norm += delta * delta;
+                    diff_norm        = std::sqrtf(diff_norm);
+                    apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm);
                 }
                 if (apg_params.eta != 1.0f) {
-                    cond_norm_sq += positive_data[i] * positive_data[i];
-                    dot += positive_data[i] * delta;
+                    dot *= apg_scale_factor;
+                    // pre-normalize (avoids one square root and ne_elements extra divs)
+                    dot /= cond_norm_sq;
                 }
-                deltas[i] = delta;
-            }
-            if (apg_params.norm_treshold > 0) {
-                diff_norm        = std::sqrtf(diff_norm);
-                apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm);
-            }
-            if (apg_params.eta != 1.0f) {
-                dot *= apg_scale_factor;
-                // pre-normalize (avoids one square root and ne_elements extra divs)
-                dot /= cond_norm_sq;
-            }
 
-            for (int i = 0; i < ne_elements; i++) {
-                deltas[i] *= apg_scale_factor;
-                if (apg_params.eta != 1.0f) {
-                    float apg_parallel   = dot * positive_data[i];
-                    float apg_orthogonal = deltas[i] - apg_parallel;
+                for (int i = 0; i < ne_elements; i++) {
+                    deltas[i] *= apg_scale_factor;
+                    if (apg_params.eta != 1.0f) {
+                        float apg_parallel   = dot * positive_data[i];
+                        float apg_orthogonal = deltas[i] - apg_parallel;
 
-                    // tweak deltas
-                    deltas[i] = apg_orthogonal + apg_params.eta * apg_parallel;
+                        // tweak deltas
+                        deltas[i] = apg_orthogonal + apg_params.eta * apg_parallel;
+                    }
                 }
             }
 

From 6baa3a651dc331842f1bca40c520cba491b4cb4c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 12 Feb 2025 03:34:47 +0100
Subject: [PATCH 6/8] Fix CI build

---
 stable-diffusion.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index be8dc2c6..71c6ce6d 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -980,7 +980,7 @@ class StableDiffusionGGML {
                     deltas[i] = delta;
                 }
                 if (apg_params.norm_treshold > 0) {
-                    diff_norm        = std::sqrtf(diff_norm);
+                    diff_norm        = sqrtf(diff_norm);
                     apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm);
                 }
                 if (apg_params.eta != 1.0f) {

From fb44a8855a65423c5090b0630a5449dab2654800 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 12 Feb 2025 16:53:23 +0100
Subject: [PATCH 7/8] apg: add experimental threshold smoothing parameter

---
 examples/cli/main.cpp | 18 ++++++++++++++++--
 stable-diffusion.cpp  | 12 +++++++++---
 stable-diffusion.h    |  1 +
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index f34c4e1d..fa7175ad 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -133,6 +133,7 @@ struct SDParams {
     float apg_eta            = 1.0f;
     float apg_momentum       = 0.0f;
     float apg_norm_threshold = 0.0f;
+    float apg_norm_smoothing = 0.0f;
 };
 
 void print_params(SDParams params) {
@@ -220,6 +221,8 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --apg-eta VALUE                    parallel projected guidance scale for APG (default: 1.0, recommended: between 0 and 1)\n");
     printf("  --apg-momentum VALUE               CFG update direction momentum for APG (default: 0, recommended: around -0.5)\n");
     printf("  --apg-nt, --apg-rescale VALUE      CFG update direction norm threshold for APG (default: 0 = disabled, recommended: 4-15)\n");
+    printf("  --apg-nt-smoothing VALUE           EXPERIMENTAL! Norm threshold smoothing for APG (default: 0 = disabled)\n");
+    printf("                                     (replaces saturation with a smooth approximation)\n");
     printf("  --slg-scale SCALE                  skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n");
     printf("                                     0 means disabled, a value of 2.5 is nice for sd3.5 medium\n");
     printf("  --eta SCALE                        eta in DDIM, only for DDIM and TCD: (default: 0)\n");
@@ -654,6 +657,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.apg_norm_threshold = std::stof(argv[i]);
+        } else if (arg == "--apg-nt-smoothing") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.apg_norm_smoothing = std::stof(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             print_usage(argc, argv);
@@ -752,6 +761,9 @@ std::string get_image_params(SDParams params, int64_t seed) {
     }
     if (params.apg_norm_threshold != 0) {
         parameter_string += "CFG normalization threshold: " + std::to_string(params.apg_norm_threshold) + ", ";
+        if (params.apg_norm_smoothing != 0) {
+            parameter_string += "CFG normalization threshold smoothing: " + std::to_string(params.apg_norm_smoothing) + ", ";
+        }
     }
     if (params.slg_scale != 0 && params.skip_layers.size() != 0) {
         parameter_string += "SLG scale: " + std::to_string(params.cfg_scale) + ", ";
@@ -1004,7 +1016,8 @@ int main(int argc, const char* argv[]) {
                                           params.skip_layer_end},
                           sd_apg_params_t{params.apg_eta,
                                           params.apg_momentum,
-                                          params.apg_norm_threshold});
+                                          params.apg_norm_threshold,
+                                          params.apg_norm_smoothing});
     } else {
         sd_image_t input_image = {(uint32_t)params.width,
                                   (uint32_t)params.height,
@@ -1076,7 +1089,8 @@ int main(int argc, const char* argv[]) {
                                               params.skip_layer_end},
                               sd_apg_params_t{params.apg_eta,
                                               params.apg_momentum,
-                                              params.apg_norm_threshold});
+                                              params.apg_norm_threshold,
+                                              params.apg_norm_smoothing});
         }
     }
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 71c6ce6d..3ed10bc2 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -801,7 +801,7 @@ class StableDiffusionGGML {
                         int start_merge_step,
                         SDCondition id_cond,
                         sd_slg_params_t slg_params = {NULL, 0, 0, 0, 0},
-                        sd_apg_params_t apg_params = {1, 0, 0},
+                        sd_apg_params_t apg_params = {1, 0, 0, 0},
                         ggml_tensor* noise_mask    = nullptr) {
         std::vector<int> skip_layers(slg_params.skip_layers, slg_params.skip_layers + slg_params.skip_layers_count);
 
@@ -980,8 +980,14 @@ class StableDiffusionGGML {
                     deltas[i] = delta;
                 }
                 if (apg_params.norm_treshold > 0) {
-                    diff_norm        = sqrtf(diff_norm);
-                    apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm);
+                    diff_norm = sqrtf(diff_norm);
+                    if (apg_params.norm_treshold_smoothing <= 0) {
+                        apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm);
+                    } else {
+                        // Experimental: smooth saturate x / (1 + x^(1/s))^s; for x > 1 use the equivalent (1 + x^(-1/s))^(-s), which tends to 1 when diff_norm is tiny or zero instead of giving 0 or NaN
+                        float x          = apg_params.norm_treshold / diff_norm;
+                        apg_scale_factor = x <= 1 ? x / std::pow(1 + std::pow(x, 1.0 / apg_params.norm_treshold_smoothing), apg_params.norm_treshold_smoothing) : std::pow(1 + std::pow(x, -1.0 / apg_params.norm_treshold_smoothing), -apg_params.norm_treshold_smoothing);
+                    }
                 }
                 if (apg_params.eta != 1.0f) {
                     dot *= apg_scale_factor;
diff --git a/stable-diffusion.h b/stable-diffusion.h
index e367d7dd..087102e7 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -131,6 +131,7 @@ typedef struct {
     float eta;
     float momentum;
     float norm_treshold;
+    float norm_treshold_smoothing;
 } sd_apg_params_t;
 
 typedef struct {

From 8408ee152b4ab2117b4d3e5fcb4ae8b1e7f0a4fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Fri, 14 Mar 2025 00:24:20 +0100
Subject: [PATCH 8/8] add uncond slg variant

fix default slg params
---
 examples/cli/main.cpp | 15 +++++++++---
 stable-diffusion.cpp  | 55 +++++++++++++++++++++++++++++--------------
 stable-diffusion.h    |  1 +
 3 files changed, 50 insertions(+), 21 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index fa7175ad..c1b04e88 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -129,6 +129,7 @@ struct SDParams {
     float slg_scale              = 0.0f;
     float skip_layer_start       = 0.01f;
     float skip_layer_end         = 0.2f;
+    bool slg_uncond              = false;
 
     float apg_eta            = 1.0f;
     float apg_momentum       = 0.0f;
@@ -225,11 +226,14 @@ void print_usage(int argc, const char* argv[]) {
     printf("                                     (replaces saturation with a smooth approximation)\n");
     printf("  --slg-scale SCALE                  skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n");
     printf("                                     0 means disabled, a value of 2.5 is nice for sd3.5 medium\n");
-    printf("  --eta SCALE                        eta in DDIM, only for DDIM and TCD: (default: 0)\n");
+    printf("  --slg-uncond                       Use CFG's forward pass for SLG instead of a separate pass, only for DiT models\n");
+    printf("                                     To use this, it's recommended to keep slg-scale to 0, both for performance and quality reasons\n");
+    printf("                                     This should be slightly faster than normal cfg when cfg_scale != 1.\n");
     printf("  --skip-layers LAYERS               Layers to skip for SLG steps: (default: [7,8,9])\n");
     printf("  --skip-layer-start START           SLG enabling point: (default: 0.01)\n");
     printf("  --skip-layer-end END               SLG disabling point: (default: 0.2)\n");
     printf("                                     SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n");
+    printf("  --eta SCALE                        eta in DDIM, only for DDIM and TCD: (default: 0)\n");
     printf("  --strength STRENGTH                strength for noising/unnoising (default: 0.75)\n");
     printf("  --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20%%)\n");
     printf("  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)\n");
@@ -590,6 +594,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.slg_scale = std::stof(argv[i]);
+        } else if (arg == "--slg-uncond") {
+            params.slg_uncond = true;
         } else if (arg == "--skip-layers") {
             if (++i >= argc) {
                 invalid_arg = true;
@@ -766,6 +772,7 @@ std::string get_image_params(SDParams params, int64_t seed) {
         }
     }
-    if (params.slg_scale != 0 && params.skip_layers.size() != 0) {
+    if ((params.slg_scale != 0 || params.slg_uncond) && params.skip_layers.size() != 0) {
+        parameter_string += "Unconditional SLG: " + std::string(params.slg_uncond ? "True" : "False") + ", ";
-        parameter_string += "SLG scale: " + std::to_string(params.cfg_scale) + ", ";
+        parameter_string += "SLG scale: " + std::to_string(params.slg_scale) + ", ";
         parameter_string += "Skip layers: [";
         for (const auto& layer : params.skip_layers) {
@@ -1013,7 +1020,8 @@ int main(int argc, const char* argv[]) {
                                           params.skip_layers.size(),
                                           params.slg_scale,
                                           params.skip_layer_start,
-                                          params.skip_layer_end},
+                                          params.skip_layer_end,
+                                          params.slg_uncond},
                           sd_apg_params_t{params.apg_eta,
                                           params.apg_momentum,
                                           params.apg_norm_threshold,
@@ -1086,7 +1094,8 @@ int main(int argc, const char* argv[]) {
                                               params.skip_layers.size(),
                                               params.slg_scale,
                                               params.skip_layer_start,
-                                              params.skip_layer_end},
+                                              params.skip_layer_end,
+                                              params.slg_uncond},
                               sd_apg_params_t{params.apg_eta,
                                               params.apg_momentum,
                                               params.apg_norm_threshold,
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 3ed10bc2..cb79197e 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -800,7 +800,7 @@ class StableDiffusionGGML {
                         const std::vector<float>& sigmas,
                         int start_merge_step,
                         SDCondition id_cond,
-                        sd_slg_params_t slg_params = {NULL, 0, 0, 0, 0},
+                        sd_slg_params_t slg_params = {NULL, 0, 0, 0, 0, false},
                         sd_apg_params_t apg_params = {1, 0, 0, 0},
                         ggml_tensor* noise_mask    = nullptr) {
         std::vector<int> skip_layers(slg_params.skip_layers, slg_params.skip_layers + slg_params.skip_layers_count);
@@ -827,7 +827,7 @@ class StableDiffusionGGML {
         struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise);
 
         bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL;
-        bool has_skiplayer     = slg_params.scale != 0.0 && skip_layers.size() > 0;
+        bool has_skiplayer     = (slg_params.scale != 0.0 || slg_params.slg_uncond) && skip_layers.size() > 0;
 
         // denoise wrapper
         struct ggml_tensor* out_cond   = ggml_dup_tensor(work_ctx, x);
@@ -839,7 +839,9 @@ class StableDiffusionGGML {
         }
         if (has_skiplayer) {
             if (sd_version_is_dit(version)) {
-                out_skip = ggml_dup_tensor(work_ctx, x);
+                if (slg_params.scale != 0.0) {
+                    out_skip = ggml_dup_tensor(work_ctx, x);
+                }
             } else {
                 has_skiplayer = false;
                 LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]);
@@ -908,6 +910,8 @@ class StableDiffusionGGML {
                                          control_strength,
                                          &out_cond);
             }
+            int step_count         = sigmas.size();
+            bool is_skiplayer_step = has_skiplayer && step > (int)(slg_params.skip_layer_start * step_count) && step < (int)(slg_params.skip_layer_end * step_count);
 
             float* negative_data = NULL;
             if (has_unconditioned) {
@@ -916,24 +920,39 @@ class StableDiffusionGGML {
                     control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector);
                     controls = control_net->controls;
                 }
-                diffusion_model->compute(n_threads,
-                                         noised_input,
-                                         timesteps,
-                                         uncond.c_crossattn,
-                                         uncond.c_concat,
-                                         uncond.c_vector,
-                                         guidance_tensor,
-                                         -1,
-                                         controls,
-                                         control_strength,
-                                         &out_uncond);
+                if (is_skiplayer_step && slg_params.slg_uncond) {
+                    LOG_DEBUG("Skipping layers at uncond step %d\n", step);
+                    diffusion_model->compute(n_threads,
+                                             noised_input,
+                                             timesteps,
+                                             uncond.c_crossattn,
+                                             uncond.c_concat,
+                                             uncond.c_vector,
+                                             guidance_tensor,
+                                             -1,
+                                             controls,
+                                             control_strength,
+                                             &out_uncond,
+                                             NULL,
+                                             skip_layers);
+                } else {
+                    diffusion_model->compute(n_threads,
+                                             noised_input,
+                                             timesteps,
+                                             uncond.c_crossattn,
+                                             uncond.c_concat,
+                                             uncond.c_vector,
+                                             guidance_tensor,
+                                             -1,
+                                             controls,
+                                             control_strength,
+                                             &out_uncond);
+                }
                 negative_data = (float*)out_uncond->data;
             }
 
-            int step_count         = sigmas.size();
-            bool is_skiplayer_step = has_skiplayer && step > (int)(slg_params.skip_layer_start * step_count) && step < (int)(slg_params.skip_layer_end * step_count);
             float* skip_layer_data = NULL;
-            if (is_skiplayer_step) {
+            if (is_skiplayer_step && slg_params.scale != 0.0) {
                 LOG_DEBUG("Skipping layers at step %d\n", step);
                 // skip layer (same as conditionned)
                 diffusion_model->compute(n_threads,
@@ -1021,7 +1040,7 @@ class StableDiffusionGGML {
                         latent_result = positive_data[i] + (cfg_scale - 1) * delta;
                     }
                 }
-                if (is_skiplayer_step) {
+                if (is_skiplayer_step && slg_params.scale != 0.0) {
                     latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_params.scale;
                 }
                 // v = latent_result, eps = latent_result
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 087102e7..c05721da 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -140,6 +140,7 @@ typedef struct {
     float scale;
     float skip_layer_start;
     float skip_layer_end;
+    bool slg_uncond;
 } sd_slg_params_t;
 
 typedef struct sd_ctx_t sd_ctx_t;