diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index af6b2bbd..c1b04e88 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -126,9 +126,15 @@ struct SDParams { int upscale_repeats = 1; std::vector skip_layers = {7, 8, 9}; - float slg_scale = 0.f; + float slg_scale = 0.0f; float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; + bool slg_uncond = false; + + float apg_eta = 1.0f; + float apg_momentum = 0.0f; + float apg_norm_threshold = 0.0f; + float apg_norm_smoothing = 0.0f; }; void print_params(SDParams params) { @@ -213,13 +219,21 @@ void print_usage(int argc, const char* argv[]) { printf(" -n, --negative-prompt PROMPT the negative prompt (default: \"\")\n"); printf(" --cfg-scale SCALE unconditional guidance scale: (default: 7.0)\n"); printf(" --guidance SCALE guidance scale for img2img (default: 3.5)\n"); + printf(" --apg-eta VALUE parallel projected guidance scale for APG (default: 1.0, recommended: between 0 and 1)\n"); + printf(" --apg-momentum VALUE CFG update direction momentum for APG (default: 0, recommended: around -0.5)\n"); + printf(" --apg-nt, --apg-rescale VALUE CFG update direction norm threshold for APG (default: 0 = disabled, recommended: 4-15)\n"); + printf(" --apg-nt-smoothing VALUE EXPERIMENTAL! 
Norm threshold smoothing for APG (default: 0 = disabled)\n"); + printf(" (replaces saturation with a smooth approximation)\n"); printf(" --slg-scale SCALE skip layer guidance (SLG) scale, only for DiT models: (default: 0)\n"); printf(" 0 means disabled, a value of 2.5 is nice for sd3.5 medium\n"); - printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n"); + printf(" --slg-uncond Use CFG's forward pass for SLG instead of a separate pass, only for DiT models\n"); + printf(" To use this, it's recommended to keep slg-scale to 0, both for performance and quality reasons\n"); + printf(" This should be slightly faster than normal cfg when cfg_scale != 1.\n"); printf(" --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n"); printf(" --skip-layer-start START SLG enabling point: (default: 0.01)\n"); printf(" --skip-layer-end END SLG disabling point: (default: 0.2)\n"); printf(" SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])\n"); + printf(" --eta SCALE eta in DDIM, only for DDIM and TCD: (default: 0)\n"); printf(" --strength STRENGTH strength for noising/unnoising (default: 0.75)\n"); printf(" --style-ratio STYLE-RATIO strength for keeping input identity (default: 20%%)\n"); printf(" --control-strength STRENGTH strength to apply Control Net (default: 0.9)\n"); @@ -580,6 +594,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.slg_scale = std::stof(argv[i]); + } else if (arg == "--slg-uncond") { + params.slg_uncond = true; } else if (arg == "--skip-layers") { if (++i >= argc) { invalid_arg = true; @@ -629,6 +645,30 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.skip_layer_end = std::stof(argv[i]); + } else if (arg == "--apg-eta") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.apg_eta = std::stof(argv[i]); + } else if (arg == "--apg-momentum") { + if (++i >= argc) { + invalid_arg = true; + break; + } + 
params.apg_momentum = std::stof(argv[i]); + } else if (arg == "--apg-nt" || arg == "--apg-rescale") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.apg_norm_threshold = std::stof(argv[i]); + } else if (arg == "--apg-nt-smoothing") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.apg_norm_smoothing = std::stof(argv[i]); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv); @@ -719,7 +759,20 @@ std::string get_image_params(SDParams params, int64_t seed) { } parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", "; parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", "; + if (params.apg_eta != 1) { + parameter_string += "APG eta: " + std::to_string(params.apg_eta) + ", "; + } + if (params.apg_momentum != 0) { + parameter_string += "CFG momentum: " + std::to_string(params.apg_momentum) + ", "; + } + if (params.apg_norm_threshold != 0) { + parameter_string += "CFG normalization threshold: " + std::to_string(params.apg_norm_threshold) + ", "; + if (params.apg_norm_smoothing != 0) { + parameter_string += "CFG normalization smoothing: " + std::to_string(params.apg_norm_smoothing) + ", "; + } + } if (params.slg_scale != 0 && params.skip_layers.size() != 0) { + parameter_string += "Unconditional SLG: " + std::string(params.slg_uncond ? 
"True" : "False") + ", "; parameter_string += "SLG scale: " + std::to_string(params.cfg_scale) + ", "; parameter_string += "Skip layers: ["; for (const auto& layer : params.skip_layers) { @@ -963,11 +1016,16 @@ int main(int argc, const char* argv[]) { params.style_ratio, params.normalize_input, params.input_id_images_path.c_str(), - params.skip_layers.data(), - params.skip_layers.size(), - params.slg_scale, - params.skip_layer_start, - params.skip_layer_end); + sd_slg_params_t{params.skip_layers.data(), + params.skip_layers.size(), + params.slg_scale, + params.skip_layer_start, + params.skip_layer_end, + params.slg_uncond}, + sd_apg_params_t{params.apg_eta, + params.apg_momentum, + params.apg_norm_threshold, + params.apg_norm_smoothing}); } else { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, @@ -1032,11 +1090,16 @@ int main(int argc, const char* argv[]) { params.style_ratio, params.normalize_input, params.input_id_images_path.c_str(), - params.skip_layers.data(), - params.skip_layers.size(), - params.slg_scale, - params.skip_layer_start, - params.skip_layer_end); + sd_slg_params_t{params.skip_layers.data(), + params.skip_layers.size(), + params.slg_scale, + params.skip_layer_start, + params.skip_layer_end, + params.slg_uncond}, + sd_apg_params_t{params.apg_eta, + params.apg_momentum, + params.apg_norm_threshold, + params.apg_norm_smoothing}); } } @@ -1075,11 +1138,11 @@ int main(int argc, const char* argv[]) { std::string dummy_name, ext, lc_ext; bool is_jpg; - size_t last = params.output_path.find_last_of("."); + size_t last = params.output_path.find_last_of("."); size_t last_path = std::min(params.output_path.find_last_of("/"), params.output_path.find_last_of("\\")); - if (last != std::string::npos // filename has extension - && (last_path == std::string::npos || last > last_path)) { + if (last != std::string::npos // filename has extension + && (last_path == std::string::npos || last > last_path)) { dummy_name = 
params.output_path.substr(0, last); ext = lc_ext = params.output_path.substr(last); std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower); @@ -1087,7 +1150,7 @@ int main(int argc, const char* argv[]) { } else { dummy_name = params.output_path; ext = lc_ext = ""; - is_jpg = false; + is_jpg = false; } // appending ".png" to absent or unknown extension if (!is_jpg && lc_ext != ".png") { @@ -1099,7 +1162,7 @@ int main(int argc, const char* argv[]) { continue; } std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext; - if(is_jpg) { + if (is_jpg) { stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, results[i].data, 90, get_image_params(params, params.seed + i).c_str()); printf("save result JPEG image to '%s'\n", final_image_path.c_str()); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index e38a6101..cb79197e 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -800,11 +800,11 @@ class StableDiffusionGGML { const std::vector& sigmas, int start_merge_step, SDCondition id_cond, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* noise_mask = nullptr) { + sd_slg_params_t slg_params = {NULL, 0, 0, 0, 0, false}, + sd_apg_params_t apg_params = {1, 0, 0, 0}, + ggml_tensor* noise_mask = nullptr) { + std::vector skip_layers(slg_params.skip_layers, slg_params.skip_layers + slg_params.skip_layers_count); + LOG_DEBUG("Sample"); struct ggml_init_params params; size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); @@ -827,7 +827,7 @@ class StableDiffusionGGML { struct ggml_tensor* noised_input = ggml_dup_tensor(work_ctx, noise); bool has_unconditioned = cfg_scale != 1.0 && uncond.c_crossattn != NULL; - bool has_skiplayer = slg_scale != 0.0 && skip_layers.size() > 0; + bool has_skiplayer = (slg_params.scale != 0.0 || slg_params.slg_uncond) && 
skip_layers.size() > 0; // denoise wrapper struct ggml_tensor* out_cond = ggml_dup_tensor(work_ctx, x); @@ -839,7 +839,9 @@ class StableDiffusionGGML { } if (has_skiplayer) { if (sd_version_is_dit(version)) { - out_skip = ggml_dup_tensor(work_ctx, x); + if (slg_params.scale != 0.0) { + out_skip = ggml_dup_tensor(work_ctx, x); + } } else { has_skiplayer = false; LOG_WARN("SLG is incompatible with %s models", model_version_to_str[version]); @@ -847,6 +849,10 @@ class StableDiffusionGGML { } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); + std::vector apg_momentum_buffer; + if (apg_params.momentum != 0) + apg_momentum_buffer.resize((size_t)ggml_nelements(denoised)); + auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { if (step == 1) { pretty_progress(0, (int)steps, 0); @@ -904,6 +910,8 @@ class StableDiffusionGGML { control_strength, &out_cond); } + int step_count = sigmas.size(); + bool is_skiplayer_step = has_skiplayer && step > (int)(slg_params.skip_layer_start * step_count) && step < (int)(slg_params.skip_layer_end * step_count); float* negative_data = NULL; if (has_unconditioned) { @@ -912,24 +920,39 @@ class StableDiffusionGGML { control_net->compute(n_threads, noised_input, control_hint, timesteps, uncond.c_crossattn, uncond.c_vector); controls = control_net->controls; } - diffusion_model->compute(n_threads, - noised_input, - timesteps, - uncond.c_crossattn, - uncond.c_concat, - uncond.c_vector, - guidance_tensor, - -1, - controls, - control_strength, - &out_uncond); + if (is_skiplayer_step && slg_params.slg_uncond) { + LOG_DEBUG("Skipping layers at uncond step %d\n", step); + diffusion_model->compute(n_threads, + noised_input, + timesteps, + uncond.c_crossattn, + uncond.c_concat, + uncond.c_vector, + guidance_tensor, + -1, + controls, + control_strength, + &out_uncond, + NULL, + skip_layers); + } else { + diffusion_model->compute(n_threads, + noised_input, + timesteps, + uncond.c_crossattn, + uncond.c_concat, + 
uncond.c_vector, + guidance_tensor, + -1, + controls, + control_strength, + &out_uncond); + } negative_data = (float*)out_uncond->data; } - int step_count = sigmas.size(); - bool is_skiplayer_step = has_skiplayer && step > (int)(skip_layer_start * step_count) && step < (int)(skip_layer_end * step_count); float* skip_layer_data = NULL; - if (is_skiplayer_step) { + if (is_skiplayer_step && slg_params.scale != 0.0) { LOG_DEBUG("Skipping layers at step %d\n", step); // skip layer (same as conditionned) diffusion_model->compute(n_threads, @@ -951,6 +974,58 @@ class StableDiffusionGGML { float* vec_input = (float*)input->data; float* positive_data = (float*)out_cond->data; int ne_elements = (int)ggml_nelements(denoised); + + float* deltas = vec_denoised; + + // https://arxiv.org/pdf/2410.02416 + float apg_scale_factor = 1.; + float diff_norm = 0; + float cond_norm_sq = 0; + float dot = 0; + if (has_unconditioned) { + for (int i = 0; i < ne_elements; i++) { + float delta = positive_data[i] - negative_data[i]; + if (apg_params.momentum != 0) { + delta += apg_params.momentum * apg_momentum_buffer[i]; + apg_momentum_buffer[i] = delta; + } + if (apg_params.norm_treshold > 0) { + diff_norm += delta * delta; + } + if (apg_params.eta != 1.0f) { + cond_norm_sq += positive_data[i] * positive_data[i]; + dot += positive_data[i] * delta; + } + deltas[i] = delta; + } + if (apg_params.norm_treshold > 0 && diff_norm > 0) { /* an all-zero delta has nothing to rescale; skipping keeps the factor at 1 and avoids a division by zero (NaN in the smoothing branch) */ + diff_norm = sqrtf(diff_norm); + if (apg_params.norm_treshold_smoothing <= 0) { + apg_scale_factor = std::min(1.0f, apg_params.norm_treshold / diff_norm); + } else { + // Experimental: smooth saturate + float x = apg_params.norm_treshold / diff_norm; + apg_scale_factor = x / std::pow(1 + std::pow(x, 1.0 / apg_params.norm_treshold_smoothing), apg_params.norm_treshold_smoothing); + } + } + if (apg_params.eta != 1.0f) { + dot *= apg_scale_factor; + // pre-normalize (avoids one square root and ne_elements extra divs); an all-zero cond output has no parallel component, so use 0 instead of dividing by zero + dot = cond_norm_sq > 0 ? dot / cond_norm_sq : 0.0f; + } + + for (int i = 0; i < ne_elements; i++) { 
+ deltas[i] *= apg_scale_factor; + if (apg_params.eta != 1.0f) { + float apg_parallel = dot * positive_data[i]; + float apg_orthogonal = deltas[i] - apg_parallel; + + // tweak deltas + deltas[i] = apg_orthogonal + apg_params.eta * apg_parallel; + } + } + } + for (int i = 0; i < ne_elements; i++) { float latent_result = positive_data[i]; if (has_unconditioned) { @@ -960,11 +1035,13 @@ class StableDiffusionGGML { int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]; float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3); } else { - latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); + float delta = deltas[i]; + + latent_result = positive_data[i] + (cfg_scale - 1) * delta; } } - if (is_skiplayer_step) { - latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; + if (is_skiplayer_step && slg_params.scale != 0.0) { + latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_params.scale; } // v = latent_result, eps = latent_result // denoised = (v * c_out + input * c_skip) or (input + eps * c_out) @@ -1004,7 +1081,8 @@ class StableDiffusionGGML { } // ldm.models.diffusion.ddpm.LatentDiffusion.get_first_stage_encoding - ggml_tensor* get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { + ggml_tensor* + get_first_stage_encoding(ggml_context* work_ctx, ggml_tensor* moments) { // ldm.modules.distributions.distributions.DiagonalGaussianDistribution.sample ggml_tensor* latent = ggml_new_tensor_4d(work_ctx, moments->type, moments->ne[0], moments->ne[1], moments->ne[2] / 2, moments->ne[3]); struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, latent); @@ -1209,11 +1287,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, std::string input_id_images_path, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL) { + 
sd_slg_params_t slg_params, + sd_apg_params_t apg_params, + ggml_tensor* masked_image = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1466,10 +1542,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, sigmas, start_merge_step, id_cond, - skip_layers, - slg_scale, - skip_layer_start, - skip_layer_end, + slg_params, + apg_params, noise_mask); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); @@ -1539,12 +1613,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, const char* input_id_images_path_c_str, - int* skip_layers = NULL, - size_t skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { - std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); + sd_slg_params_t slg_params, + sd_apg_params_t apg_params) { LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { return NULL; @@ -1618,10 +1688,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, style_ratio, normalize_input, input_id_images_path_c_str, - skip_layers_vec, - slg_scale, - skip_layer_start, - skip_layer_end); + slg_params, + apg_params); size_t t1 = ggml_time_ms(); @@ -1651,12 +1719,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, const char* input_id_images_path_c_str, - int* skip_layers = NULL, - size_t skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { - std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); + sd_slg_params_t slg_params, + sd_apg_params_t apg_params) { LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { return NULL; @@ -1798,10 +1862,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, style_ratio, normalize_input, input_id_images_path_c_str, - skip_layers_vec, - slg_scale, - skip_layer_start, - 
skip_layer_end, + slg_params, + apg_params, masked_image); size_t t2 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index 52dcc848..c05721da 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -127,6 +127,22 @@ typedef struct { uint8_t* data; } sd_image_t; +typedef struct { + float eta; + float momentum; + float norm_treshold; + float norm_treshold_smoothing; +} sd_apg_params_t; + +typedef struct { + int* skip_layers; + size_t skip_layers_count; + float scale; + float skip_layer_start; + float skip_layer_end; + bool slg_uncond; +} sd_slg_params_t; + typedef struct sd_ctx_t sd_ctx_t; SD_API sd_ctx_t* new_sd_ctx(const char* model_path, @@ -172,11 +188,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, float style_strength, bool normalize_input, const char* input_id_images_path, - int* skip_layers, - size_t skip_layers_count, - float slg_scale, - float skip_layer_start, - float skip_layer_end); + sd_slg_params_t slg_params, + sd_apg_params_t apg_params); SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, @@ -199,11 +212,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, float style_strength, bool normalize_input, const char* input_id_images_path, - int* skip_layers, - size_t skip_layers_count, - float slg_scale, - float skip_layer_start, - float skip_layer_end); + sd_slg_params_t slg_params, + sd_apg_params_t apg_params); SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, sd_image_t init_image,