From a5dbce592790118582d8f41b2b710a153ae524fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 12 Feb 2025 00:15:59 +0100 Subject: [PATCH 1/8] apg: first implementation --- stable-diffusion.cpp | 60 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index e38a6101..30754cf6 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -847,6 +847,15 @@ class StableDiffusionGGML { } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); + // TODO do not hardcode + float apg_eta = .08f; + float apg_momentum = -.5f; + float apg_norm_treshold = 15.0f; + + std::vector apg_momentum_buffer; + if (apg_momentum != 0) + apg_momentum_buffer.resize((size_t)ggml_nelements(denoised)); + auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { if (step == 1) { pretty_progress(0, (int)steps, 0); @@ -951,6 +960,50 @@ class StableDiffusionGGML { float* vec_input = (float*)input->data; float* positive_data = (float*)out_cond->data; int ne_elements = (int)ggml_nelements(denoised); + + float* deltas = vec_denoised; + + // https://arxiv.org/pdf/2410.02416 + float apg_scale_factor = 1.; + float diff_norm = 0; + float cond_norm_sq = 0; + float dot = 0; + for (int i = 0; i < ne_elements; i++) { + float delta = positive_data[i] - negative_data[i]; + if (apg_momentum != 0) { + delta += apg_momentum * apg_momentum_buffer[i]; + apg_momentum_buffer[i] = delta; + } + if (apg_norm_treshold > 0) { + diff_norm += delta * delta; + } + if (apg_eta != 1.0f) { + cond_norm_sq += positive_data[i] * positive_data[i]; + dot += positive_data[i] * delta; + } + deltas[i] = delta; + } + if (apg_norm_treshold > 0) { + diff_norm = std::sqrtf(diff_norm); + apg_scale_factor = std::min(1.0f, apg_norm_treshold / diff_norm); + } + if (apg_eta != 1.0f) { + dot *= apg_scale_factor; + // pre-normalize (avoids one square root and ne_elements extra divs) + dot /= cond_norm_sq; + } + + for (int i = 0; i < ne_elements; i++) { + deltas[i] *= apg_scale_factor; + if (apg_eta != 1.0f) { + float apg_parallel = dot * positive_data[i]; + float apg_orthogonal = deltas[i] - apg_parallel; + + // tweak deltas + deltas[i] = apg_orthogonal + apg_eta * apg_parallel; + } + } + for (int i = 0; i < ne_elements; i++) { float latent_result = positive_data[i]; if (has_unconditioned) { @@ -960,7 +1013,9 @@ class StableDiffusionGGML { int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]; float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3); } else { - latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); + float delta = deltas[i]; + + latent_result = positive_data[i] + (cfg_scale - 1) * delta; } } if (is_skiplayer_step) { @@ -1004,7 +1059,8 @@ class StableDiffusionGGML { } // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding - ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { + ggml_tensor* + get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); From 02114c2e9a60ee45f075509da28863b6feace192 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 12 Feb 2025 01:04:40 +0100 Subject: [PATCH 2/8] refactor guidance params in lib --- examples/cli/main.cpp | 32 +++++++++-------- stable-diffusion.cpp | 83 ++++++++++++++++--------------------------- stable-diffusion.h | 28 +++++++++------ 3 files changed, 66 insertions(+), 77 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index af6b2bbd..8a9114f0 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -963,11 +963,12 @@ int main(int argc, const char* argv[]) { params.style_ratio, params.normalize_input, params.input_id_images_path.c_str(), - params.skip_layers.data(), - params.skip_layers.size(), - params.slg_scale, - params.skip_layer_start, - params.skip_layer_end); + sd_slg_params_t{params.skip_layers.data(), + params.skip_layers.size(), + params.slg_scale, + params.skip_layer_start, + params.skip_layer_end}, + sd_apg_params_t{1, 0, 0}); } else { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, @@ -1032,11 +1033,12 @@ int main(int argc, const char* argv[]) { params.style_ratio, params.normalize_input, params.input_id_images_path.c_str(), - params.skip_layers.data(), - params.skip_layers.size(), - params.slg_scale, - params.skip_layer_start, - params.skip_layer_end); + sd_slg_params_t{params.skip_layers.data(), + params.skip_layers.size(), + params.slg_scale, + params.skip_layer_start, + params.skip_layer_end}, + sd_apg_params_t{1, 0, 0}); } } @@ -1075,11 +1077,11 @@ int main(int argc, const char* argv[]) { std::string dummy_name, ext, lc_ext; bool is_jpg; - size_t last = params.output_path.find_last_of("."); + size_t last = params.output_path.find_last_of("."); size_t last_path = std::min(params.output_path.find_last_of("/"), params.output_path.find_last_of("\\")); - if (last != std::string::npos // filename has extension - && (last_path == std::string::npos || last > last_path)) { + if (last != std::string::npos // filename has extension + && (last_path == std::string::npos || last > last_path)) { dummy_name = params.output_path.substr(0, last); ext = lc_ext = params.output_path.substr(last); std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower); @@ -1087,7 +1089,7 @@ int main(int argc, const char* argv[]) { } else { dummy_name = params.output_path; ext = lc_ext = ""; - is_jpg = false; + is_jpg = false; } // appending ".png" to absent or unknown extension if (!is_jpg && lc_ext != ".png") { @@ -1099,7 +1101,7 @@ int main(int argc, const char* argv[]) { continue; } std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext; - if(is_jpg) { + if (is_jpg) { stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, results[i].data, 90, get_image_params(params, params.seed + i).c_str()); printf("save result JPEG image to '%s'\n", final_image_path.c_str()); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 30754cf6..bc5bdec0 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -800,11 +800,11 @@ class StableDiffusionGGML { const std::vector& sigmas, int start_merge_step, SDCondition id_cond, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* noise_mask = nullptr) { + sd_slg_params_t slg_params = {NULL, 0, 0, 0, 0}, + sd_apg_params_t apg_params = {1, 0, 0}, + ggml_tensor* noise_mask = nullptr) { + std::vector skip_layers(slg_params.skip_layers, slg_params.skip_layers + slg_params.skip_layers_count); + LOG_DEBUG("Sample"); struct ggml_init_params params; size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); @@ -827,7 +827,7 @@ class StableDiffusionGGML { struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise); bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL; - bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; + bool has_skiplayer = slg_params.scale != 0.0 && skip_layers.size() > 0; // denoise wrapper struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); @@ -847,13 +847,8 @@ class StableDiffusionGGML { } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - // TODO do not hardcode - float apg_eta = .08f; - float apg_momentum = -.5f; - float apg_norm_treshold = 15.0f; - std::vector apg_momentum_buffer; - if (apg_momentum != 0) + if (apg_params.momentum != 0) apg_momentum_buffer.resize((size_t)ggml_nelements(denoised)); auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { @@ -936,7 +931,7 @@ class StableDiffusionGGML { } int step_count = sigmas.size(); - bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count); + bool is_skiplayer_step = has_skiplayer && step > (int)(slg_params.skip_layer_start * step_count) && step < (int)(slg_params.skip_layer_end * step_count); float* skip_layer_data = NULL; if (is_skiplayer_step) { LOG_DEBUG("Skipping layers at step %d\n", step); @@ -970,24 +965,24 @@ class StableDiffusionGGML { float dot = 0; for (int i = 0; i < ne_elements; i++) { float delta = positive_data[i] - negative_data[i]; - if (apg_momentum != 0) { - delta += apg_momentum * apg_momentum_buffer[i]; + if (apg_params.momentum != 0) { + delta += apg_params.momentum * apg_momentum_buffer[i]; apg_momentum_buffer[i] = delta; } - if (apg_norm_treshold > 0) { + if (apg_params.norm_treshold > 0) { diff_norm += delta * delta; } - if (apg_eta != 1.0f) { + if (apg_params.eta != 1.0f) { cond_norm_sq += positive_data[i] * positive_data[i]; dot += positive_data[i] * delta; } deltas[i] = delta; } - if (apg_norm_treshold > 0) { + if (apg_params.norm_treshold > 0) { diff_norm = std::sqrtf(diff_norm); - apg_scale_factor = std::min(1.0f, apg_norm_treshold / diff_norm); + apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm); } - if (apg_eta != 1.0f) { + if (apg_params.eta != 1.0f) { dot *= apg_scale_factor; // pre-normalize (avoids one square root and ne_elements extra divs) dot /= cond_norm_sq; @@ -995,12 +990,12 @@ class StableDiffusionGGML { for (int i = 0; i < ne_elements; i++) { deltas[i] *= apg_scale_factor; - if (apg_eta != 1.0f) { + if (apg_params.eta != 1.0f) { float apg_parallel = dot * positive_data[i]; float apg_orthogonal = deltas[i] - apg_parallel; // tweak deltas - deltas[i] = apg_orthogonal + apg_eta * apg_parallel; + deltas[i] = apg_orthogonal + apg_params.eta * apg_parallel; } } @@ -1019,7 +1014,7 @@ class StableDiffusionGGML { } } if (is_skiplayer_step) { - latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; + latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_params.scale; } // v = latent_result, eps = latent_result // denoised = (v * c_out + input * c_skip) or (input + eps * c_out) @@ -1265,11 +1260,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, std::string input_id_images_path, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL) { + sd_slg_params_t slg_params, + sd_apg_params_t apg_params, + ggml_tensor* masked_image = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1522,10 +1515,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, sigmas, start_merge_step, id_cond, - skip_layers, - slg_scale, - skip_layer_start, - skip_layer_end, + slg_params, + apg_params, noise_mask); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); @@ -1595,12 +1586,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, const char* input_id_images_path_c_str, - int* skip_layers = NULL, - size_t skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { - std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); + sd_slg_params_t slg_params, + sd_apg_params_t apg_params) { LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { return NULL; @@ -1674,10 +1661,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, style_ratio, normalize_input, input_id_images_path_c_str, - skip_layers_vec, - slg_scale, - skip_layer_start, - skip_layer_end); + slg_params, + apg_params); size_t t1 = ggml_time_ms(); @@ -1707,12 +1692,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, const char* input_id_images_path_c_str, - int* skip_layers = NULL, - size_t skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { - std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); + sd_slg_params_t slg_params, + sd_apg_params_t apg_params) { LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { return NULL; @@ -1854,10 +1835,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, style_ratio, normalize_input, input_id_images_path_c_str, - skip_layers_vec, - slg_scale, - skip_layer_start, - skip_layer_end, + slg_params, + apg_params, masked_image); size_t t2 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index 52dcc848..e367d7dd 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -127,6 +127,20 @@ typedef struct { uint8_t* data; } sd_image_t; +typedef struct { + float eta; + float momentum; + float norm_treshold; +} sd_apg_params_t; + +typedef struct { + int* skip_layers; + size_t skip_layers_count; + float scale; + float skip_layer_start; + float skip_layer_end; +} sd_slg_params_t; + typedef struct sd_ctx_t sd_ctx_t; SD_API sd_ctx_t* new_sd_ctx(const char* model_path, @@ -172,11 +186,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, float style_strength, bool normalize_input, const char* input_id_images_path, - int* skip_layers, - size_t skip_layers_count, - float slg_scale, - float skip_layer_start, - float skip_layer_end); + sd_slg_params_t slg_params, + sd_apg_params_t apg_params); SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, @@ -199,11 +210,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, float style_strength, bool normalize_input, const char* input_id_images_path, - int* skip_layers, - size_t skip_layers_count, - float slg_scale, - float skip_layer_start, - float skip_layer_end); + sd_slg_params_t slg_params, + sd_apg_params_t apg_params); SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, sd_image_t init_image, From 98e056b6e2054760622513518b6c47fc56fc9c59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 12 Feb 2025 01:38:05 +0100 Subject: [PATCH 3/8] main: add apg support --- examples/cli/main.cpp | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 8a9114f0..4edf5bc7 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -126,9 +126,13 @@ struct SDParams { int upscale_repeats = 1; std::vector skip_layers = {7, 8, 9}; - float slg_scale = 0.f; + float slg_scale = 0.0f; float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; + + float apg_eta = 1.0f; + float apg_momentum = 0.0f; + float apg_norm_treshold = 0.0f; }; void print_params(SDParams params) { @@ -213,6 +217,9 @@ void print_usage(int argc, const char* argv[]) { printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n"); printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n"); printf(" --guidance SCALE guidance scale for img2img (default: 3.5)\n"); + printf(" --apg-eta VALUE parallel projected guidance scale for APG (default: 1.0, recommended: between 0 and 1)\n"); + printf(" --apg-momentum VALUE CFG update direction momentum for APG (default: 0, recommended: around -0.5)\n"); + printf(" --apg-nt, --apg-rescale VALUE CFG update direction norm threshold for APG (default: 0 = disabled, recommended: 4-15)\n"); printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n"); printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n"); printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n"); @@ -629,6 +636,24 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.skip_layer_end = std::stof(argv[i]); + } else if (arg == "--apg-eta") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.apg_eta = std::stof(argv[i]); + } else if (arg == "--apg-momentum") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.apg_momentum = std::stof(argv[i]); + } else if (arg == "--apg-nt" || arg == "--apg-rescale") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.apg_norm_treshold = std::stof(argv[i]); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv); @@ -968,7 +993,9 @@ int main(int argc, const char* argv[]) { params.slg_scale, params.skip_layer_start, params.skip_layer_end}, - sd_apg_params_t{1, 0, 0}); + sd_apg_params_t{params.apg_eta, + params.apg_momentum, + params.apg_norm_treshold}); } else { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, @@ -1038,7 +1065,9 @@ int main(int argc, const char* argv[]) { params.slg_scale, params.skip_layer_start, params.skip_layer_end}, - sd_apg_params_t{1, 0, 0}); + sd_apg_params_t{params.apg_eta, + params.apg_momentum, + params.apg_norm_treshold}); } } From e64b3b853a563774e7a0f4c5c7d0c2e63cab3a8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 12 Feb 2025 02:17:45 +0100 Subject: [PATCH 4/8] add apg settings to image params --- examples/cli/main.cpp | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 4edf5bc7..f34c4e1d 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -130,9 +130,9 @@ struct SDParams { float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; - float apg_eta = 1.0f; - float apg_momentum = 0.0f; - float apg_norm_treshold = 0.0f; + float apg_eta = 1.0f; + float apg_momentum = 0.0f; + float apg_norm_threshold = 0.0f; }; void print_params(SDParams params) { @@ -653,7 +653,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - params.apg_norm_treshold = std::stof(argv[i]); + params.apg_norm_threshold = std::stof(argv[i]); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv); @@ -744,6 +744,15 @@ std::string get_image_params(SDParams params, int64_t seed) { } parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", "; parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", "; + if (params.apg_eta != 1) { + parameter_string += "APG eta: " + std::to_string(params.apg_eta) + ", "; + } + if (params.apg_momentum != 0) { + parameter_string += "CFG momentum: " + std::to_string(params.apg_momentum) + ", "; + } + if (params.apg_norm_threshold != 0) { + parameter_string += "CFG normalization threshold: " + std::to_string(params.apg_norm_threshold) + ", "; + } if (params.slg_scale != 0 && params.skip_layers.size() != 0) { parameter_string += "SLG scale: " + std::to_string(params.cfg_scale) + ", "; parameter_string += "Skip layers: ["; @@ -995,7 +1004,7 @@ int main(int argc, const char* argv[]) { params.skip_layer_end}, sd_apg_params_t{params.apg_eta, params.apg_momentum, - params.apg_norm_treshold}); + params.apg_norm_threshold}); } else { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, @@ -1067,7 +1076,7 @@ int main(int argc, const char* argv[]) { params.skip_layer_end}, sd_apg_params_t{params.apg_eta, params.apg_momentum, - params.apg_norm_treshold}); + params.apg_norm_threshold}); } } From 98064d0f7bd759b46d0ce6b0efc32405e5a4f3c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 12 Feb 2025 03:08:58 +0100 Subject: [PATCH 5/8] Fix cfg 1 crash --- stable-diffusion.cpp | 56 +++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index bc5bdec0..be8dc2c6 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -802,7 +802,7 @@ class StableDiffusionGGML { SDCondition id_cond, sd_slg_params_t slg_params = {NULL, 0, 0, 0, 0}, sd_apg_params_t apg_params = {1, 0, 0}, - ggml_tensor* noise_mask = nullptr) { + ggml_tensor* noise_mask = nullptr) { std::vector skip_layers(slg_params.skip_layers, slg_params.skip_layers + slg_params.skip_layers_count); LOG_DEBUG("Sample"); @@ -963,39 +963,41 @@ class StableDiffusionGGML { float diff_norm = 0; float cond_norm_sq = 0; float dot = 0; - for (int i = 0; i < ne_elements; i++) { - float delta = positive_data[i] - negative_data[i]; - if (apg_params.momentum != 0) { - delta += apg_params.momentum * apg_momentum_buffer[i]; - apg_momentum_buffer[i] = delta; + if (has_unconditioned) { + for (int i = 0; i < ne_elements; i++) { + float delta = positive_data[i] - negative_data[i]; + if (apg_params.momentum != 0) { + delta += apg_params.momentum * apg_momentum_buffer[i]; + apg_momentum_buffer[i] = delta; + } + if (apg_params.norm_treshold > 0) { + diff_norm += delta * delta; + } + if (apg_params.eta != 1.0f) { + cond_norm_sq += positive_data[i] * positive_data[i]; + dot += positive_data[i] * delta; + } + deltas[i] = delta; } if (apg_params.norm_treshold > 0) { - diff_norm += delta * delta; + diff_norm = std::sqrtf(diff_norm); + apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm); } if (apg_params.eta != 1.0f) { - cond_norm_sq += positive_data[i] * positive_data[i]; - dot += positive_data[i] * delta; + dot *= apg_scale_factor; + // pre-normalize (avoids one square root and ne_elements extra divs) + dot /= cond_norm_sq; } - deltas[i] = delta; - } - if (apg_params.norm_treshold > 0) { - diff_norm = std::sqrtf(diff_norm); - apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm); - } - if (apg_params.eta != 1.0f) { - dot *= apg_scale_factor; - // pre-normalize (avoids one square root and ne_elements extra divs) - dot /= cond_norm_sq; - } - for (int i = 0; i < ne_elements; i++) { - deltas[i] *= apg_scale_factor; - if (apg_params.eta != 1.0f) { - float apg_parallel = dot * positive_data[i]; - float apg_orthogonal = deltas[i] - apg_parallel; + for (int i = 0; i < ne_elements; i++) { + deltas[i] *= apg_scale_factor; + if (apg_params.eta != 1.0f) { + float apg_parallel = dot * positive_data[i]; + float apg_orthogonal = deltas[i] - apg_parallel; - // tweak deltas - deltas[i] = apg_orthogonal + apg_params.eta * apg_parallel; + // tweak deltas + deltas[i] = apg_orthogonal + apg_params.eta * apg_parallel; + } } } From 6baa3a651dc331842f1bca40c520cba491b4cb4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 12 Feb 2025 03:34:47 +0100 Subject: [PATCH 6/8] Fix CI build --- stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index be8dc2c6..71c6ce6d 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -980,7 +980,7 @@ class StableDiffusionGGML { deltas[i] = delta; } if (apg_params.norm_treshold > 0) { - diff_norm = std::sqrtf(diff_norm); + diff_norm = sqrtf(diff_norm); apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm); } if (apg_params.eta != 1.0f) { From fb44a8855a65423c5090b0630a5449dab2654800 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 12 Feb 2025 16:53:23 +0100 Subject: [PATCH 7/8] apg: add experimental threshold smoothing parameter --- examples/cli/main.cpp | 18 ++++++++++++++++-- stable-diffusion.cpp | 12 +++++++++--- stable-diffusion.h | 1 + 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index f34c4e1d..fa7175ad 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -133,6 +133,7 @@ struct SDParams { float apg_eta = 1.0f; float apg_momentum = 0.0f; float apg_norm_threshold = 0.0f; + float apg_norm_smoothing = 0.0f; }; void print_params(SDParams params) { @@ -220,6 +221,8 @@ void print_usage(int argc, const char* argv[]) { printf(" --apg-eta VALUE parallel projected guidance scale for APG (default: 1.0, recommended: between 0 and 1)\n"); printf(" --apg-momentum VALUE CFG update direction momentum for APG (default: 0, recommended: around -0.5)\n"); printf(" --apg-nt, --apg-rescale VALUE CFG update direction norm threshold for APG (default: 0 = disabled, recommended: 4-15)\n"); + printf(" --apg-nt-smoothing VALUE EXPERIMENTAL! Norm threshold smoothing for APG (default: 0 = disabled)\n"); + printf(" (replaces saturation with a smooth approximation)\n"); printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n"); printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n"); printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n"); @@ -654,6 +657,12 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.apg_norm_threshold = std::stof(argv[i]); + } else if (arg == "--apg-nt-smoothing") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.apg_norm_smoothing = std::stof(argv[i]); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv); @@ -752,6 +761,9 @@ std::string get_image_params(SDParams params, int64_t seed) { } if (params.apg_norm_threshold != 0) { parameter_string += "CFG normalization threshold: " + std::to_string(params.apg_norm_threshold) + ", "; + if (params.apg_norm_smoothing != 0) { + parameter_string += "CFG normalization threshold: " + std::to_string(params.apg_norm_smoothing) + ", "; + } } if (params.slg_scale != 0 && params.skip_layers.size() != 0) { parameter_string += "SLG scale: " + std::to_string(params.cfg_scale) + ", "; @@ -1004,7 +1016,8 @@ int main(int argc, const char* argv[]) { params.skip_layer_end}, sd_apg_params_t{params.apg_eta, params.apg_momentum, - params.apg_norm_threshold}); + params.apg_norm_threshold, + params.apg_norm_smoothing}); } else { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, @@ -1076,7 +1089,8 @@ int main(int argc, const char* argv[]) { params.skip_layer_end}, sd_apg_params_t{params.apg_eta, params.apg_momentum, - params.apg_norm_threshold}); + params.apg_norm_threshold, + params.apg_norm_smoothing}); } } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 71c6ce6d..3ed10bc2 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -801,7 +801,7 @@ class StableDiffusionGGML { int start_merge_step, SDCondition id_cond, sd_slg_params_t slg_params = {NULL, 0, 0, 0, 0}, - sd_apg_params_t apg_params = {1, 0, 0}, + sd_apg_params_t apg_params = {1, 0, 0, 0}, ggml_tensor* noise_mask = nullptr) { std::vector skip_layers(slg_params.skip_layers, slg_params.skip_layers + slg_params.skip_layers_count); @@ -980,8 +980,14 @@ class StableDiffusionGGML { deltas[i] = delta; } if (apg_params.norm_treshold > 0) { - diff_norm = sqrtf(diff_norm); - apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm); + diff_norm = sqrtf(diff_norm); + if (apg_params.norm_treshold_smoothing <= 0) { + apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm); + } else { + // Experimental: smooth saturate + float x = apg_params.norm_treshold / diff_norm; + apg_scale_factor = x / std::pow(1 + std::pow(x, 1.0 / apg_params.norm_treshold_smoothing), apg_params.norm_treshold_smoothing); + } } if (apg_params.eta != 1.0f) { dot *= apg_scale_factor; diff --git a/stable-diffusion.h b/stable-diffusion.h index e367d7dd..087102e7 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -131,6 +131,7 @@ typedef struct { float eta; float momentum; float norm_treshold; + float norm_treshold_smoothing; } sd_apg_params_t; typedef struct { From 8408ee152b4ab2117b4d3e5fcb4ae8b1e7f0a4fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 14 Mar 2025 00:24:20 +0100 Subject: [PATCH 8/8] add uncond slg variant fix default slg params --- examples/cli/main.cpp | 15 +++++++++--- stable-diffusion.cpp | 55 +++++++++++++++++++++++++++++-------------- stable-diffusion.h | 1 + 3 files changed, 50 insertions(+), 21 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index fa7175ad..c1b04e88 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -129,6 +129,7 @@ struct SDParams { float slg_scale = 0.0f; float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; + bool slg_uncond = false; float apg_eta = 1.0f; float apg_momentum = 0.0f; @@ -225,11 +226,14 @@ void print_usage(int argc, const char* argv[]) { printf(" (replaces saturation with a smooth approximation)\n"); printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n"); printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n"); - printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n"); + printf(" --slg-uncond Use CFG's forward pass for SLG instead of a separate pass, only for DiT models\n"); + printf(" To use this, it's recommended to keep slg-scale to 0, both for performance and quality reasons\n"); + printf(" This should be slightly faster than normal cfg when cfg_scale != 1.\n"); printf(" --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n"); printf(" --skip-layer-start START SLG enabling point: (default: 0.01)\n"); printf(" --skip-layer-end END SLG disabling point: (default: 0.2)\n"); printf(" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n"); + printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n"); printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n"); printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20%%)\n"); printf(" --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n"); @@ -590,6 +594,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.slg_scale = std::stof(argv[i]); + } else if (arg == "--slg-uncond") { + params.slg_uncond = true; } else if (arg == "--skip-layers") { if (++i >= argc) { invalid_arg = true; @@ -766,6 +772,7 @@ std::string get_image_params(SDParams params, int64_t seed) { } } if (params.slg_scale != 0 && params.skip_layers.size() != 0) { + parameter_string += "Unconditional SLG: " + std::string(params.slg_uncond ? "True" : "False") + ", "; parameter_string += "SLG scale: " + std::to_string(params.cfg_scale) + ", "; parameter_string += "Skip layers: ["; for (const auto& layer : params.skip_layers) { @@ -1013,7 +1020,8 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end}, + params.skip_layer_end, + params.slg_uncond}, sd_apg_params_t{params.apg_eta, params.apg_momentum, params.apg_norm_threshold, @@ -1086,7 +1094,8 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end}, + params.skip_layer_end, + params.slg_uncond}, sd_apg_params_t{params.apg_eta, params.apg_momentum, params.apg_norm_threshold, diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 3ed10bc2..cb79197e 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -800,7 +800,7 @@ class StableDiffusionGGML { const std::vector& sigmas, int start_merge_step, SDCondition id_cond, - sd_slg_params_t slg_params = {NULL, 0, 0, 0, 0}, + sd_slg_params_t slg_params = {NULL, 0, 0, 0, 0, false}, sd_apg_params_t apg_params = {1, 0, 0, 0}, ggml_tensor* noise_mask = nullptr) { std::vector skip_layers(slg_params.skip_layers, slg_params.skip_layers + slg_params.skip_layers_count); @@ -827,7 +827,7 @@ class StableDiffusionGGML { struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise); bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL; - bool has_skiplayer = slg_params.scale != 0.0 && skip_layers.size() > 0; + bool has_skiplayer = (slg_params.scale != 0.0 || slg_params.slg_uncond) && skip_layers.size() > 0; // denoise wrapper struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); @@ -839,7 +839,9 @@ class StableDiffusionGGML { } if (has_skiplayer) { if (sd_version_is_dit(version)) { - out_skip = ggml_dup_tensor(work_ctx, x); + if (slg_params.scale != 0.0) { + out_skip = ggml_dup_tensor(work_ctx, x); + } } else { has_skiplayer = false; LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]); @@ -908,6 +910,8 @@ class StableDiffusionGGML { control_strength, &out_cond); } + int step_count = sigmas.size(); + bool is_skiplayer_step = has_skiplayer && step > (int)(slg_params.skip_layer_start * step_count) && step < (int)(slg_params.skip_layer_end * step_count); float* negative_data = NULL; if (has_unconditioned) { @@ -916,24 +920,39 @@ class StableDiffusionGGML { control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector); controls = control_net->controls; } - diffusion_model->compute(n_threads, - noised_input, - timesteps, - uncond.c_crossattn, - uncond.c_concat, - uncond.c_vector, - guidance_tensor, - -1, - controls, - control_strength, - &out_uncond); + if (is_skiplayer_step && slg_params.slg_uncond) { + LOG_DEBUG("Skipping layers at uncond step %d\n", step); + diffusion_model->compute(n_threads, + noised_input, + timesteps, + uncond.c_crossattn, + uncond.c_concat, + uncond.c_vector, + guidance_tensor, + -1, + controls, + control_strength, + &out_uncond, + NULL, + skip_layers); + } else { + diffusion_model->compute(n_threads, + noised_input, + timesteps, + uncond.c_crossattn, + uncond.c_concat, + uncond.c_vector, + guidance_tensor, + -1, + controls, + control_strength, + &out_uncond); + } negative_data = (float*)out_uncond->data; } - int step_count = sigmas.size(); - bool is_skiplayer_step = has_skiplayer && step > (int)(slg_params.skip_layer_start * step_count) && step < (int)(slg_params.skip_layer_end * step_count); float* skip_layer_data = NULL; - if (is_skiplayer_step) { + if (is_skiplayer_step && slg_params.scale != 0.0) { LOG_DEBUG("Skipping layers at step %d\n", step); // skip layer (same as conditionned) diffusion_model->compute(n_threads, @@ -1021,7 +1040,7 @@ class StableDiffusionGGML { latent_result = positive_data[i] + (cfg_scale - 1) * delta; } } - if (is_skiplayer_step) { + if (is_skiplayer_step && slg_params.scale != 0.0) { latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_params.scale; } // v = latent_result, eps = latent_result diff --git a/stable-diffusion.h b/stable-diffusion.h index 087102e7..c05721da 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -140,6 +140,7 @@ typedef struct { float scale; float skip_layer_start; float skip_layer_end; + bool slg_uncond; } sd_slg_params_t; typedef struct sd_ctx_t sd_ctx_t;