diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index b3ae569e6..bcdce4b63 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -102,9 +102,15 @@ struct SDParams { int upscale_repeats = 1; std::vector<int> skip_layers = {7, 8, 9}; - float slg_scale = 0.f; + float slg_scale = 0.0f; float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; + bool slg_uncond = false; + + float apg_eta = 1.0f; + float apg_momentum = 0.0f; + float apg_norm_threshold = 0.0f; + float apg_norm_smoothing = 0.0f; bool chroma_use_dit_mask = true; bool chroma_use_t5_mask = false; @@ -204,13 +210,21 @@ void print_usage(int argc, const char* argv[]) { printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n"); printf(" --img-cfg-scale SCALE image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n"); printf(" --guidance SCALE distilled guidance scale for models with guidance input (default: 3.5)\n"); + printf(" --apg-eta VALUE parallel projected guidance scale for APG (default: 1.0, recommended: between 0 and 1)\n"); + printf(" --apg-momentum VALUE Momentum for guidance adjustments with APG (default: 0, recommended: around -0.5 (negative))\n"); + printf(" --apg-nt VALUE APG norm threshold: Upper bound allowed for the amplitude (L2 norm) of guidance updates (default: 0 = disabled, recommended: 4-15)\n"); + printf(" --apg-nt-smoothing VALUE EXPERIMENTAL! 
Norm threshold smoothing for APG, smoothly decrease the amplitude of the guidance update if it gets too close to the norm threshold (default: 0 = disabled)\n"); + printf(" (replaces saturation with a smooth approximation)\n"); printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n"); printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n"); - printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n"); + printf(" --slg-uncond Use CFG's forward pass for SLG instead of a separate pass, only for DiT models\n"); + printf(" To use this, it's recommended to keep slg-scale to 0, both for performance and quality reasons\n"); + printf(" This should be slightly faster than normal cfg when cfg_scale != 1.\n"); printf(" --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n"); printf(" --skip-layer-start START SLG enabling point: (default: 0.01)\n"); printf(" --skip-layer-end END SLG disabling point: (default: 0.2)\n"); printf(" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n"); + printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n"); printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n"); printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20)\n"); printf(" --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n"); @@ -412,7 +426,10 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--slg-scale", "", &params.slg_scale}, {"", "--skip-layer-start", "", &params.skip_layer_start}, {"", "--skip-layer-end", "", &params.skip_layer_end}, - + {"", "--apg-eta", "", &params.apg_eta}, + {"", "--apg-momentum", "", &params.apg_momentum}, + {"", "--apg-nt", "", &params.apg_norm_threshold}, + {"", "--apg-nt-smoothing", "", &params.apg_norm_smoothing}, }; options.bool_options = { @@ -425,6 +442,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--canny", "", 
true, ¶ms.canny_preprocess}, {"-v", "--verbos", "", true, ¶ms.verbose}, {"", "--color", "", true, ¶ms.color}, + {"", "--slg-uncond", "", true, ¶ms.slg_uncond}, {"", "--chroma-disable-dit-mask", "", false, ¶ms.chroma_use_dit_mask}, {"", "--chroma-enable-t5-mask", "", true, ¶ms.chroma_use_t5_mask}, }; @@ -660,7 +678,20 @@ std::string get_image_params(SDParams params, int64_t seed) { } parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", "; parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", "; + if (params.apg_eta != 1) { + parameter_string += "APG eta: " + std::to_string(params.apg_eta) + ", "; + } + if (params.apg_momentum != 0) { + parameter_string += "CFG momentum: " + std::to_string(params.apg_momentum) + ", "; + } + if (params.apg_norm_threshold != 0) { + parameter_string += "CFG normalization threshold: " + std::to_string(params.apg_norm_threshold) + ", "; + if (params.apg_norm_smoothing != 0) { + parameter_string += "CFG normalization threshold: " + std::to_string(params.apg_norm_smoothing) + ", "; + } + } if (params.slg_scale != 0 && params.skip_layers.size() != 0) { + parameter_string += "Unconditional SLG: " + std::string(params.slg_uncond ? 
"True" : "False") + ", "; parameter_string += "SLG scale: " + std::to_string(params.cfg_scale) + ", "; parameter_string += "Skip layers: ["; for (const auto& layer : params.skip_layers) { @@ -733,17 +764,25 @@ int main(int argc, const char* argv[]) { parse_args(argc, argv, params); - sd_guidance_params_t guidance_params = {params.cfg_scale, - params.img_cfg_scale, - params.min_cfg, - params.guidance, - { - params.skip_layers.data(), - params.skip_layers.size(), - params.skip_layer_start, - params.skip_layer_end, - params.slg_scale, - }}; + sd_guidance_params_t guidance_params = { + params.cfg_scale, + params.img_cfg_scale, + params.min_cfg, + params.guidance, + { + params.skip_layers.data(), + params.skip_layers.size(), + params.skip_layer_start, + params.skip_layer_end, + params.slg_scale, + }, + { + params.apg_eta, + params.apg_momentum, + params.apg_norm_threshold, + params.apg_norm_smoothing, + }, + }; sd_set_log_callback(sd_log_cb, (void*)¶ms); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 402585f1c..733365490 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -885,7 +885,7 @@ class StableDiffusionGGML { bool has_unconditioned = img_cfg_scale != 1.0 && uncond.c_crossattn != NULL; bool has_img_cond = cfg_scale != img_cfg_scale && img_cond.c_crossattn != NULL; - bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; + bool has_skiplayer = (slg_scale != 0.0 || guidance.slg.uncond) && skip_layers.size() > 0; // denoise wrapper struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); @@ -898,7 +898,9 @@ class StableDiffusionGGML { } if (has_skiplayer) { if (sd_version_is_dit(version)) { - out_skip = ggml_dup_tensor(work_ctx, x); + if (slg_scale != 0.0) { + out_skip = ggml_dup_tensor(work_ctx, x); + } } else { has_skiplayer = false; LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]); @@ -909,6 +911,10 @@ class StableDiffusionGGML { } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); + 
std::vector<float> apg_momentum_buffer; + if (guidance.apg.momentum != 0) + apg_momentum_buffer.resize((size_t)ggml_nelements(denoised)); + auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { if (step == 1) { pretty_progress(0, (int)steps, 0); @@ -968,6 +974,8 @@ class StableDiffusionGGML { control_strength, &out_cond); } + int step_count = sigmas.size(); + bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count); float* negative_data = NULL; if (has_unconditioned) { @@ -976,18 +984,36 @@ class StableDiffusionGGML { control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector); controls = control_net->controls; } - diffusion_model->compute(n_threads, - noised_input, - timesteps, - uncond.c_crossattn, - uncond.c_concat, - uncond.c_vector, - guidance_tensor, - ref_latents, - -1, - controls, - control_strength, - &out_uncond); + if (is_skiplayer_step && guidance.slg.uncond) { + LOG_DEBUG("Skipping layers at uncond step %d\n", step); + diffusion_model->compute(n_threads, + noised_input, + timesteps, + uncond.c_crossattn, + uncond.c_concat, + uncond.c_vector, + guidance_tensor, + ref_latents, + -1, + controls, + control_strength, + &out_uncond, + NULL, + skip_layers); + } else { + diffusion_model->compute(n_threads, + noised_input, + timesteps, + uncond.c_crossattn, + uncond.c_concat, + uncond.c_vector, + guidance_tensor, + ref_latents, + -1, + controls, + control_strength, + &out_uncond); + } negative_data = (float*)out_uncond->data; } @@ -1008,10 +1034,8 @@ class StableDiffusionGGML { img_cond_data = (float*)out_img_cond->data; } - int step_count = sigmas.size(); - bool is_skiplayer_step = has_skiplayer && step > (int)(guidance.slg.layer_start * step_count) && step < (int)(guidance.slg.layer_end * step_count); float* skip_layer_data = NULL; - if (is_skiplayer_step) { + if (is_skiplayer_step && slg_scale != 0.0) { 
LOG_DEBUG("Skipping layers at step %d\n", step); // skip layer (same as conditionned) diffusion_model->compute(n_threads, @@ -1034,6 +1058,87 @@ class StableDiffusionGGML { float* vec_input = (float*)input->data; float* positive_data = (float*)out_cond->data; int ne_elements = (int)ggml_nelements(denoised); + + float* deltas = vec_denoised; + + // APG: https://arxiv.org/pdf/2410.02416 + + bool log_cfg_norm = false; + const char* SD_LOG_CFG_DELTA_NORM = getenv("SD_LOG_CFG_DELTA_NORM"); + if (SD_LOG_CFG_DELTA_NORM != nullptr) { + std::string sd_log_cfg_norm_str = SD_LOG_CFG_DELTA_NORM; + if (sd_log_cfg_norm_str == "ON" || sd_log_cfg_norm_str == "TRUE") { + log_cfg_norm = true; + } else if (sd_log_cfg_norm_str != "OFF" && sd_log_cfg_norm_str != "FALSE") { + LOG_WARN("SD_LOG_CFG_DELTA_NORM environment variable has unexpected value. Assuming default (\"OFF\"). (Expected \"ON\"/\"TRUE\" or\"OFF\"/\"FALSE\", got \"%s\")", SD_LOG_CFG_DELTA_NORM); + } + } + float apg_scale_factor = 1.; + float diff_norm = 0; + float cond_norm_sq = 0; + float dot = 0; + if (has_unconditioned || has_img_cond) { + for (int i = 0; i < ne_elements; i++) { + float delta; + if (has_img_cond) { + if (cfg_scale == 1 && has_unconditioned) { + // Weird guidance (important: use img_cfg_scale instead of cfg_scale in the final formula); negative_data is NULL without the unconditional pass, hence the has_unconditioned check + delta = img_cond_data[i] - negative_data[i]; + } else if (has_unconditioned) { + // 2-conditioning CFG (img_cfg_scale != cfg_scale != 1) + delta = positive_data[i] + (negative_data[i] * (1 - img_cfg_scale) + img_cond_data[i] * (img_cfg_scale - cfg_scale)) / (cfg_scale - 1); + } else { + // pure img CFG (img_cfg_scale == 1, cfg_scale !=1) + delta = positive_data[i] - img_cond_data[i]; + } + } else { + // classic CFG (img_cfg_scale == cfg_scale != 1) + delta = positive_data[i] - negative_data[i]; + } + if (guidance.apg.momentum != 0) { + delta += guidance.apg.momentum * apg_momentum_buffer[i]; + apg_momentum_buffer[i] = delta; + } + if (guidance.apg.norm_treshold > 0 || log_cfg_norm) { + 
diff_norm += delta * delta; + } + if (guidance.apg.eta != 1.0f) { + cond_norm_sq += positive_data[i] * positive_data[i]; + dot += positive_data[i] * delta; + } + deltas[i] = delta; + } + if (log_cfg_norm) { + LOG_INFO("CFG Delta norm: %.2f", sqrtf(diff_norm)); + } + if (guidance.apg.norm_treshold > 0 && diff_norm > 0) { + diff_norm = sqrtf(diff_norm); + if (guidance.apg.norm_treshold_smoothing <= 0) { + apg_scale_factor = std::min(1.0f, guidance.apg.norm_treshold / diff_norm); + } else { + // Experimental: smooth saturate (diff_norm > 0 is checked above, so x is finite and the ratio cannot become inf/inf) + float x = guidance.apg.norm_treshold / diff_norm; + apg_scale_factor = x / std::pow(1 + std::pow(x, 1.0 / guidance.apg.norm_treshold_smoothing), guidance.apg.norm_treshold_smoothing); + } + } + if (guidance.apg.eta != 1.0f) { + dot *= apg_scale_factor; + // pre-normalize (avoids one square root and ne_elements extra divs); an all-zero conditional output has no parallel component, so use 0 instead of 0/0 + dot = cond_norm_sq > 0 ? dot / cond_norm_sq : 0.0f; + } + + for (int i = 0; i < ne_elements; i++) { + deltas[i] *= apg_scale_factor; + if (guidance.apg.eta != 1.0f) { + float apg_parallel = dot * positive_data[i]; + float apg_orthogonal = deltas[i] - apg_parallel; + + // tweak deltas + deltas[i] = apg_orthogonal + guidance.apg.eta * apg_parallel; + } + } + } + for (int i = 0; i < ne_elements; i++) { float latent_result = positive_data[i]; if (has_unconditioned) { @@ -1043,19 +1148,19 @@ class StableDiffusionGGML { int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]; float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3); } else { - if (has_img_cond) { - // out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond) - latent_result = negative_data[i] + img_cfg_scale * (img_cond_data[i] - negative_data[i]) + cfg_scale * (positive_data[i] - img_cond_data[i]); - } else { - // img_cfg_scale == cfg_scale - latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); + float delta = deltas[i]; + + if (cfg_scale != 1) { + latent_result = positive_data[i] + (cfg_scale - 1) * delta; + } 
else if (has_img_cond) { + latent_result = positive_data[i] + (img_cfg_scale - 1) * delta; } } } else if (has_img_cond) { - // img_cfg_scale == 1 - latent_result = img_cond_data[i] + cfg_scale * (positive_data[i] - img_cond_data[i]); + // img_cfg_scale == 1: equals img_cond + cfg_scale * (cond - img_cond), but read through deltas so the APG adjustments apply + latent_result = positive_data[i] + (cfg_scale - 1) * deltas[i]; } - if (is_skiplayer_step) { + if (is_skiplayer_step && slg_scale != 0.0) { latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; } // v = latent_result, eps = latent_result @@ -1096,7 +1201,8 @@ class StableDiffusionGGML { } // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding - ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { + ggml_tensor* + get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); diff --git a/stable-diffusion.h b/stable-diffusion.h index a60325923..3ed56353b 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -152,14 +152,23 @@ typedef struct { float layer_start; float layer_end; float scale; + bool uncond; /* true: on SLG steps the unconditional pass is computed with the skip layers */ } sd_slg_params_t; +typedef struct { + float eta; /* weight kept for the part of the guidance delta parallel to the conditional output; 1 leaves the delta unprojected */ + float momentum; /* factor applied to the previous step's delta when it is added to the current one; 0 = off */ + float norm_treshold; /* upper bound for the L2 norm of the guidance delta; <= 0 = off */ + float norm_treshold_smoothing; /* > 0 replaces the hard clamp with a smooth saturation (experimental) */ +} sd_apg_params_t; + typedef struct { float txt_cfg; float img_cfg; float min_cfg; float distilled_guidance; sd_slg_params_t slg; + sd_apg_params_t apg; } sd_guidance_params_t; typedef struct {