leejet
diff --git a/common.hpp b/common.hpp
Lines changed: 1 addition & 1 deletion
diff --git a/conditioner.hpp b/conditioner.hpp
Lines changed: 241 additions & 15 deletions
diff --git a/control.hpp b/control.hpp
Lines changed: 9 additions & 9 deletions
diff --git a/denoiser.hpp b/denoiser.hpp
Lines changed: 64 additions & 3 deletions
diff --git a/diffusion_model.hpp b/diffusion_model.hpp
Lines changed: 56 additions & 2 deletions
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
Lines changed: 41 additions & 4 deletions
@@ -367,7 +367,7 @@ class SpatialTransformer : public GGMLBlock {
     int64_t n_head;
     int64_t d_head;
     int64_t depth       = 1;    // 1
-    int64_t context_dim = 768;  // hidden_size, 1024 for VERSION_2_x
+    int64_t context_dim = 768;  // hidden_size, 1024 for VERSION_SD2
 
 public:
     SpatialTransformer(int64_t in_channels,
 
@@ -14,7 +14,7 @@
 */
 class ControlNetBlock : public GGMLBlock {
 protected:
-    SDVersion version = VERSION_1_x;
+    SDVersion version = VERSION_SD1;
     // network hparams
     int in_channels                        = 4;
     int out_channels                       = 4;
@@ -26,19 +26,19 @@ class ControlNetBlock : public GGMLBlock {
     int time_embed_dim                     = 1280;  // model_channels*4
     int num_heads                          = 8;
     int num_head_channels                  = -1;   // channels // num_heads
-    int context_dim                        = 768;  // 1024 for VERSION_2_x, 2048 for VERSION_XL
+    int context_dim                        = 768;  // 1024 for VERSION_SD2, 2048 for VERSION_SDXL
 
 public:
     int model_channels  = 320;
-    int adm_in_channels = 2816;  // only for VERSION_XL
+    int adm_in_channels = 2816;  // only for VERSION_SDXL
 
-    ControlNetBlock(SDVersion version = VERSION_1_x)
+    ControlNetBlock(SDVersion version = VERSION_SD1)
         : version(version) {
-        if (version == VERSION_2_x) {
+        if (version == VERSION_SD2) {
             context_dim       = 1024;
             num_head_channels = 64;
             num_heads         = -1;
-        } else if (version == VERSION_XL) {
+        } else if (version == VERSION_SDXL) {
             context_dim           = 2048;
             attention_resolutions = {4, 2};
             channel_mult          = {1, 2, 4};
@@ -58,7 +58,7 @@ class ControlNetBlock : public GGMLBlock {
         // time_embed_1 is nn.SiLU()
         blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
 
-        if (version == VERSION_XL || version == VERSION_SVD) {
+        if (version == VERSION_SDXL || version == VERSION_SVD) {
             blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
             // label_emb_1 is nn.SiLU()
             blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
@@ -307,7 +307,7 @@ class ControlNetBlock : public GGMLBlock {
 };
 
 struct ControlNet : public GGMLRunner {
-    SDVersion version = VERSION_1_x;
+    SDVersion version = VERSION_SD1;
     ControlNetBlock control_net;
 
     ggml_backend_buffer_t control_buffer = NULL;  // keep control output tensors in backend memory
@@ -318,7 +318,7 @@ struct ControlNet : public GGMLRunner {
 
     ControlNet(ggml_backend_t backend,
                ggml_type wtype,
-               SDVersion version = VERSION_1_x)
+               SDVersion version = VERSION_SD1)
         : GGMLRunner(backend, wtype), control_net(version) {
         control_net.init(params_ctx, wtype);
     }
 
@@ -8,6 +8,7 @@
 // Ref: https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/external.py
 
 #define TIMESTEPS 1000
+#define FLUX_TIMESTEPS 1000
 
 struct SigmaSchedule {
     int version = 0;
@@ -144,13 +145,13 @@ struct AYSSchedule : SigmaSchedule {
         std::vector<float> results(n + 1);
 
         switch (version) {
-            case VERSION_2_x: /* fallthrough */
+            case VERSION_SD2: /* fallthrough */
                 LOG_WARN("AYS not designed for SD2.X models");
-            case VERSION_1_x:
+            case VERSION_SD1:
                 LOG_INFO("AYS using SD1.5 noise levels");
                 inputs = noise_levels[0];
                 break;
-            case VERSION_XL:
+            case VERSION_SDXL:
                 LOG_INFO("AYS using SDXL noise levels");
                 inputs = noise_levels[1];
                 break;
@@ -350,6 +351,66 @@ struct DiscreteFlowDenoiser : public Denoiser {
     }
 };
 
+
+inline float flux_time_shift(float mu, float sigma, float t) {  // inline: header definition, keeps multiple includers ODR-safe
+    return std::exp(mu) / (std::exp(mu) + std::pow((1.0 / t - 1.0), sigma));  // e^mu / (e^mu + (1/t - 1)^sigma), as float
+}
+
+struct FluxFlowDenoiser : public Denoiser {
+    // Flux flow denoiser: sigma(t) = flux_time_shift(shift, 1, (t + 1) / TIMESTEPS) for timestep index t.
+    float sigmas[TIMESTEPS];    // sigmas[t] == t_to_sigma(t); only the first and last entries are read here
+    float shift      = 1.15f;   // forwarded as `mu` to flux_time_shift
+    float sigma_data = 1.0f;
+
+    FluxFlowDenoiser(float shift = 1.15f) {
+        set_parameters(shift);
+    }
+
+    // Stores `shift` and rebuilds the sigma table so that sigmas[t] == t_to_sigma(t) for t in [0, TIMESTEPS).
+    void set_parameters(float shift = 1.15f) {
+        this->shift = shift;
+        // The old `i/TIMESTEPS * TIMESTEPS` was int division: it fed 0 for every entry but the last (flat table).
+        for (int t = 0; t < TIMESTEPS; t++) {
+            sigmas[t] = t_to_sigma(static_cast<float>(t));
+        }
+    }
+
+    float sigma_min() {  // t_to_sigma(0)
+        return sigmas[0];
+    }
+
+    float sigma_max() {  // t_to_sigma(TIMESTEPS - 1)
+        return sigmas[TIMESTEPS - 1];
+    }
+
+    float sigma_to_t(float sigma) {  // returns sigma unchanged; NOTE(review): not the inverse of t_to_sigma
+        return sigma;
+    }
+
+    float t_to_sigma(float t) {  // timestep index -> sigma
+        t = t + 1;  // shift to 1-based so that t = TIMESTEPS - 1 evaluates the time shift at exactly 1.0
+        return flux_time_shift(shift, 1.0f, t / TIMESTEPS);
+    }
+
+    std::vector<float> get_scalings(float sigma) {
+        return {1.0f, -sigma, 1.0f};  // {c_skip, c_out, c_in}
+    }
+
+    // this function will modify noise/latent; presumably latent becomes (1 - sigma) * latent + sigma * noise
+    ggml_tensor* noise_scaling(float sigma, ggml_tensor* noise, ggml_tensor* latent) {
+        ggml_tensor_scale(noise, sigma);
+        ggml_tensor_scale(latent, 1.0f - sigma);
+        ggml_tensor_add(latent, noise);
+        return latent;
+    }
+
+    // Scales latent by 1 / (1 - sigma); the factor is infinite at sigma == 1, so callers should pass sigma < 1.
+    ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) {
+        ggml_tensor_scale(latent, 1.0f / (1.0f - sigma));
+        return latent;
+    }
+};
+
 typedef std::function<ggml_tensor*(ggml_tensor*, float, int)> denoise_cb_t;
 
 // k diffusion reverse ODE: dx = (x - D(x;\sigma)) / \sigma dt; \sigma(t) = t
 
@@ -3,6 +3,7 @@
 
 #include "mmdit.hpp"
 #include "unet.hpp"
+#include "flux.hpp"
 
 struct DiffusionModel {
     virtual void compute(int n_threads,
@@ -11,6 +12,7 @@ struct DiffusionModel {
                          struct ggml_tensor* context,
                          struct ggml_tensor* c_concat,
                          struct ggml_tensor* y,
+                         struct ggml_tensor* guidance,
                          int num_video_frames                      = -1,
                          std::vector<struct ggml_tensor*> controls = {},
                          float control_strength                    = 0.f,
@@ -29,7 +31,7 @@ struct UNetModel : public DiffusionModel {
 
     UNetModel(ggml_backend_t backend,
               ggml_type wtype,
-              SDVersion version = VERSION_1_x)
+              SDVersion version = VERSION_SD1)
         : unet(backend, wtype, version) {
     }
 
@@ -63,6 +65,7 @@ struct UNetModel : public DiffusionModel {
                  struct ggml_tensor* context,
                  struct ggml_tensor* c_concat,
                  struct ggml_tensor* y,
+                 struct ggml_tensor* guidance,
                  int num_video_frames                      = -1,
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength                    = 0.f,
@@ -77,7 +80,7 @@ struct MMDiTModel : public DiffusionModel {
 
     MMDiTModel(ggml_backend_t backend,
                ggml_type wtype,
-               SDVersion version = VERSION_3_2B)
+               SDVersion version = VERSION_SD3_2B)
         : mmdit(backend, wtype, version) {
     }
 
@@ -111,6 +114,7 @@ struct MMDiTModel : public DiffusionModel {
                  struct ggml_tensor* context,
                  struct ggml_tensor* c_concat,
                  struct ggml_tensor* y,
+                 struct ggml_tensor* guidance,
                  int num_video_frames                      = -1,
                  std::vector<struct ggml_tensor*> controls = {},
                  float control_strength                    = 0.f,
@@ -120,4 +124,54 @@ struct MMDiTModel : public DiffusionModel {
     }
 };
 
+
+struct FluxModel : public DiffusionModel {
+    // Wraps a Flux::FluxRunner behind the DiffusionModel base; apart from get_adm_in_channels, every
+    // method below simply forwards to the runner.
+    Flux::FluxRunner flux;
+
+    // Builds the runner from the same backend / weight type / version arguments.
+    FluxModel(ggml_backend_t backend, ggml_type wtype, SDVersion version = VERSION_FLUX_DEV)
+        : flux(backend, wtype, version) {
+    }
+
+    // Thin forwards to the runner's buffer management calls.
+    void alloc_params_buffer() { flux.alloc_params_buffer(); }
+    void free_params_buffer() { flux.free_params_buffer(); }
+    void free_compute_buffer() { flux.free_compute_buffer(); }
+
+    // Forwards to the runner, passing "model.diffusion_model" (presumably the tensor-name prefix).
+    void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
+        flux.get_param_tensors(tensors, "model.diffusion_model");
+    }
+
+    // Size reported by the runner for its parameter buffer.
+    size_t get_params_buffer_size() {
+        return flux.get_params_buffer_size();
+    }
+
+    // Hard-coded to 768 for this model.
+    int64_t get_adm_in_channels() {
+        return 768;
+    }
+
+    // Runs the Flux runner. c_concat, num_video_frames, controls and control_strength are part of the
+    // signature but are not passed on to the runner.
+    void compute(int n_threads,
+                 struct ggml_tensor* x,
+                 struct ggml_tensor* timesteps,
+                 struct ggml_tensor* context,
+                 struct ggml_tensor* c_concat,
+                 struct ggml_tensor* y,
+                 struct ggml_tensor* guidance,
+                 int num_video_frames                      = -1,
+                 std::vector<struct ggml_tensor*> controls = {},
+                 float control_strength                    = 0.f,
+                 struct ggml_tensor** output               = NULL,
+                 struct ggml_context* output_ctx           = NULL) {
+        // Only the inputs the runner takes are handed over.
+        flux.compute(n_threads, x, timesteps, context, y, guidance, output, output_ctx);
+    }
+};
+
 #endif
@@ -7,9 +7,8 @@
 #include <vector>
 
 // #include "preprocessing.hpp"
-#include "mmdit.hpp"
+#include "flux.hpp"
 #include "stable-diffusion.h"
-#include "t5.hpp"
 
 #define STB_IMAGE_IMPLEMENTATION
 #define STB_IMAGE_STATIC
@@ -68,6 +67,9 @@ struct SDParams {
     SDMode mode   = TXT2IMG;
 
     std::string model_path;
+    std::string clip_l_path;
+    std::string t5xxl_path;
+    std::string diffusion_model_path;
     std::string vae_path;
     std::string taesd_path;
     std::string esrgan_path;
@@ -85,6 +87,7 @@ struct SDParams {
     std::string negative_prompt;
     float min_cfg     = 1.0f;
     float cfg_scale   = 7.0f;
+    float guidance    = 3.5f;
     float style_ratio = 20.f;
     int clip_skip     = -1;  // <= 0 represents unspecified
     int width         = 512;
@@ -120,6 +123,9 @@ void print_params(SDParams params) {
     printf("    mode:              %s\n", modes_str[params.mode]);
     printf("    model_path:        %s\n", params.model_path.c_str());
     printf("    wtype:             %s\n", params.wtype < SD_TYPE_COUNT ? sd_type_name(params.wtype) : "unspecified");
+    printf("    clip_l_path:       %s\n", params.clip_l_path.c_str());
+    printf("    t5xxl_path:        %s\n", params.t5xxl_path.c_str());
+    printf("    diffusion_model_path:   %s\n", params.diffusion_model_path.c_str());
     printf("    vae_path:          %s\n", params.vae_path.c_str());
     printf("    taesd_path:        %s\n", params.taesd_path.c_str());
     printf("    esrgan_path:       %s\n", params.esrgan_path.c_str());
@@ -140,6 +146,7 @@ void print_params(SDParams params) {
     printf("    negative_prompt:   %s\n", params.negative_prompt.c_str());
     printf("    min_cfg:           %.2f\n", params.min_cfg);
     printf("    cfg_scale:         %.2f\n", params.cfg_scale);
+    printf("    guidance:          %.2f\n", params.guidance);
     printf("    clip_skip:         %d\n", params.clip_skip);
     printf("    width:             %d\n", params.width);
     printf("    height:            %d\n", params.height);
@@ -240,6 +247,24 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.model_path = argv[i];
+        } else if (arg == "--clip_l") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.clip_l_path = argv[i];
+        } else if (arg == "--t5xxl") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.t5xxl_path = argv[i];
+        } else if (arg == "--diffusion-model") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.diffusion_model_path = argv[i];
         } else if (arg == "--vae") {
             if (++i >= argc) {
                 invalid_arg = true;
@@ -359,6 +384,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.cfg_scale = std::stof(argv[i]);
+        } else if (arg == "--guidance") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.guidance = std::stof(argv[i]);
         } else if (arg == "--strength") {
             if (++i >= argc) {
                 invalid_arg = true;
@@ -501,8 +532,8 @@ void parse_args(int argc, const char** argv, SDParams& params) {
         exit(1);
     }
 
-    if (params.model_path.length() == 0) {
-        fprintf(stderr, "error: the following arguments are required: model_path\n");
+    if (params.model_path.length() == 0 && params.diffusion_model_path.length() == 0) {
+        fprintf(stderr, "error: the following arguments are required: model_path/diffusion_model\n");
         print_usage(argc, argv);
         exit(1);
     }
@@ -570,6 +601,7 @@ std::string get_image_params(SDParams params, int64_t seed) {
     }
     parameter_string += "Steps: " + std::to_string(params.sample_steps) + ", ";
     parameter_string += "CFG scale: " + std::to_string(params.cfg_scale) + ", ";
+    parameter_string += "Guidance: " + std::to_string(params.guidance) + ", ";
     parameter_string += "Seed: " + std::to_string(seed) + ", ";
     parameter_string += "Size: " + std::to_string(params.width) + "x" + std::to_string(params.height) + ", ";
     parameter_string += "Model: " + sd_basename(params.model_path) + ", ";
@@ -717,6 +749,9 @@ int main(int argc, const char* argv[]) {
     }
 
     sd_ctx_t* sd_ctx = new_sd_ctx(params.model_path.c_str(),
+                                  params.clip_l_path.c_str(),
+                                  params.t5xxl_path.c_str(),
+                                  params.diffusion_model_path.c_str(),
                                   params.vae_path.c_str(),
                                   params.taesd_path.c_str(),
                                   params.controlnet_path.c_str(),
@@ -770,6 +805,7 @@ int main(int argc, const char* argv[]) {
                           params.negative_prompt.c_str(),
                           params.clip_skip,
                           params.cfg_scale,
+                          params.guidance,
                           params.width,
                           params.height,
                           params.sample_method,
@@ -830,6 +866,7 @@ int main(int argc, const char* argv[]) {
                               params.negative_prompt.c_str(),
                               params.clip_skip,
                               params.cfg_scale,
+                              params.guidance,
                               params.width,
                               params.height,
                               params.sample_method,