leejet
diff --git a/‎.clang-format
Lines changed: 0 additions & 1 deletion b/‎.clang-format
Lines changed: 0 additions & 1 deletion
diff --git a/‎.gitignore
Lines changed: 0 additions & 1 deletion b/‎.gitignore
Lines changed: 0 additions & 1 deletion
diff --git a/‎CMakeLists.txt
Lines changed: 2 additions & 5 deletions b/‎CMakeLists.txt
Lines changed: 2 additions & 5 deletions
diff --git a/‎README.md
Lines changed: 6 additions & 2 deletions b/‎README.md
Lines changed: 6 additions & 2 deletions
diff --git a/‎assets/control.png
4.28 KB b/‎assets/control.png
4.28 KB
diff --git a/‎assets/control_2.png
6.09 KB b/‎assets/control_2.png
6.09 KB
diff --git a/‎assets/control_3.png
18.4 KB b/‎assets/control_3.png
18.4 KB
diff --git a/‎clip.hpp
Lines changed: 137 additions & 16 deletions b/‎clip.hpp
Lines changed: 137 additions & 16 deletions
@@ -3,7 +3,6 @@ UseTab: Never
 IndentWidth: 4
 TabWidth: 4
 AllowShortIfStatementsOnASingleLine: false
-IndentCaseLabels: false
 ColumnLimit: 0
 AccessModifierOffset: -4
 NamespaceIndentation: All
 
@@ -10,5 +10,4 @@ test/
 *.gguf
 output*.png
 models*
-!taesd-model.gguf
 *.log
@@ -28,17 +28,13 @@ option(SD_CUBLAS                     "sd: cuda backend" OFF)
 option(SD_HIPBLAS                    "sd: rocm backend" OFF)
 option(SD_METAL                      "sd: metal backend" OFF)
 option(SD_FLASH_ATTN                 "sd: use flash attention for x4 less memory usage" OFF)
-option(SD_FAST_SOFTMAX               "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(BUILD_SHARED_LIBS             "sd: build shared libs" OFF)
 #option(SD_BUILD_SERVER               "sd: build server example"                           ON)
 
 if(SD_CUBLAS)
 	message("Use CUBLAS as backend stable-diffusion")
     set(GGML_CUBLAS ON)
     add_definitions(-DSD_USE_CUBLAS)
-    if(SD_FAST_SOFTMAX)
-        set(GGML_CUDA_FAST_SOFTMAX ON)
-    endif()
 endif()
 
 if(SD_METAL)
@@ -64,7 +60,8 @@ endif()
 set(SD_LIB stable-diffusion)
 
 add_library(${SD_LIB} stable-diffusion.h stable-diffusion.cpp model.h model.cpp util.h util.cpp upscaler.cpp tensor.hpp
-             ggml_extend.hpp clip.hpp common.hpp unet.hpp tae.hpp esrgan.hpp lora.hpp denoiser.hpp rng.hpp rng_philox.hpp)
+             ggml_extend.hpp clip.hpp common.hpp unet.hpp tae.hpp esrgan.hpp lora.hpp denoiser.hpp rng.hpp rng_philox.hpp
+             control.hpp preprocessing.hpp)
 
 if(BUILD_SHARED_LIBS)
     message("Build shared library")
 
@@ -31,6 +31,7 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
 - Faster and memory efficient latent decoding with [TAESD](https://github.com/madebyollin/taesd)
 - Upscale images generated with [ESRGAN](https://github.com/xinntao/Real-ESRGAN)
 - VAE tiling processing for reduce memory usage
+- Control Net support with SD 1.5
 - Sampling method
     - `Euler A`
     - `Euler`
@@ -53,9 +54,7 @@ Inference of [Stable Diffusion](https://github.com/CompVis/stable-diffusion) in
 - [ ] More sampling methods
 - [ ] Make inference faster
     - The current implementation of ggml_conv_2d is slow and has high memory usage
-    - Implement Winograd Convolution 2D for 3x3 kernel filtering
 - [ ] Continuing to reduce memory usage (quantizing the weights of ggml_conv_2d)
-- [ ] Implement Textual Inversion (embeddings)
 - [ ] Implement Inpainting support
 - [ ] k-quants support
 
@@ -159,16 +158,20 @@ arguments:
   -m, --model [MODEL]                path to model
   --vae [VAE]                        path to vae
   --taesd [TAESD_PATH]               path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)
+  --control-net [CONTROL_PATH]       path to control net model
+  --embd-dir [EMBEDDING_PATH]        path to embeddings.
   --upscale-model [ESRGAN_PATH]      path to esrgan model. Upscale images after generate, just RealESRGAN_x4plus_anime_6B supported by now.
   --type [TYPE]                      weight type (f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
                                      If not specified, the default is the type of the weight file.
   --lora-model-dir [DIR]             lora model directory
   -i, --init-img [IMAGE]             path to the input image, required by img2img
+  --control-image [IMAGE]            path to image condition, control net
   -o, --output OUTPUT                path to write result image to (default: ./output.png)
   -p, --prompt [PROMPT]              the prompt to render
   -n, --negative-prompt PROMPT       the negative prompt (default: "")
   --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)
   --strength STRENGTH                strength for noising/unnoising (default: 0.75)
+  --control-strength STRENGTH        strength to apply Control Net (default: 0.9)
                                      1.0 corresponds to full destruction of information in init image
   -H, --height H                     image height, in pixel space (default: 512)
   -W, --width W                      image width, in pixel space (default: 512)
@@ -182,6 +185,7 @@ arguments:
   --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
                                      <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
   --vae-tiling                       process vae in tiles to reduce memory usage
+  --control-net-cpu                  keep controlnet in cpu (for low vram)
   -v, --verbose                      print extra info
 ```
 
 
@@ -2,6 +2,7 @@
 #define __CLIP_HPP__
 
 #include "ggml_extend.hpp"
+#include "model.h"
 
 /*================================================== CLIPTokenizer ===================================================*/
 
@@ -67,6 +68,9 @@ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
 }
 
 // Ref: https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py
+
+typedef std::function<bool(std::string&, std::vector<int32_t>&)> on_new_token_cb_t;
+
 class CLIPTokenizer {
 private:
     SDVersion version = VERSION_1_x;
@@ -234,8 +238,11 @@ class CLIPTokenizer {
         return result;
     }
 
-    std::vector<int> tokenize(std::string text, size_t max_length = 0, bool padding = false) {
-        std::vector<int32_t> tokens = encode(text);
+    std::vector<int> tokenize(std::string text,
+                              on_new_token_cb_t on_new_token_cb,
+                              size_t max_length = 0,
+                              bool padding      = false) {
+        std::vector<int32_t> tokens = encode(text, on_new_token_cb);
         tokens.insert(tokens.begin(), BOS_TOKEN_ID);
         if (max_length > 0) {
             if (tokens.size() > max_length - 1) {
@@ -255,7 +262,7 @@ class CLIPTokenizer {
         return tokens;
     }
 
-    std::vector<int> encode(std::string text) {
+    std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
         std::string original_text = text;
         std::vector<int32_t> bpe_tokens;
         text = whitespace_clean(text);
@@ -268,6 +275,10 @@ class CLIPTokenizer {
         std::string str = text;
         std::vector<std::string> token_strs;
         while (std::regex_search(str, matches, pat)) {
+            bool skip = on_new_token_cb(str, bpe_tokens);
+            if (skip) {
+                continue;
+            }
             for (auto& token : matches) {
                 std::string token_str = token.str();
                 std::u32string utf32_token;
@@ -536,7 +547,13 @@ class CLIPEmbeddings : public GGMLBlock {
           num_positions(num_positions) {
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* input_ids) {
+    struct ggml_tensor* get_token_embed_weight() {
+        return params["token_embedding.weight"];
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                struct ggml_tensor* input_ids,
+                                struct ggml_tensor* custom_embed_weight) {
         // input_ids: [N, n_token]
         auto token_embed_weight    = params["token_embedding.weight"];
         auto position_embed_weight = params["position_embedding.weight"];
@@ -545,7 +562,7 @@ class CLIPEmbeddings : public GGMLBlock {
 
         // token_embedding + position_embedding
         auto x = ggml_add(ctx,
-                          ggml_get_rows(ctx, token_embed_weight, input_ids),
+                          ggml_get_rows(ctx, custom_embed_weight != NULL ? custom_embed_weight : token_embed_weight, input_ids),
                           position_embed_weight);  // [N, n_token, embed_dim]
         return x;
     }
@@ -667,14 +684,23 @@ class CLIPTextModel : public GGMLBlock {
         clip_skip = skip;
     }
 
-    struct ggml_tensor* forward(struct ggml_context* ctx, struct ggml_tensor* input_ids, size_t max_token_idx = 0, bool return_pooled = false) {
+    struct ggml_tensor* get_token_embed_weight() {
+        auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
+        return embeddings->get_token_embed_weight();
+    }
+
+    struct ggml_tensor* forward(struct ggml_context* ctx,
+                                struct ggml_tensor* input_ids,
+                                struct ggml_tensor* tkn_embeddings,
+                                size_t max_token_idx = 0,
+                                bool return_pooled   = false) {
         // input_ids: [N, n_token]
         auto embeddings       = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         auto encoder          = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
         auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks["final_layer_norm"]);
 
-        auto x = embeddings->forward(ctx, input_ids);  // [N, n_token, hidden_size]
-        x = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true);
+        auto x = embeddings->forward(ctx, input_ids, tkn_embeddings);  // [N, n_token, hidden_size]
+        x      = encoder->forward(ctx, x, return_pooled ? -1 : clip_skip, true);
         if (return_pooled || with_final_ln) {
             x = final_layer_norm->forward(ctx, x);
         }
@@ -695,6 +721,7 @@ class CLIPVisionModel : public GGMLBlock {
     void init_params(struct ggml_context* ctx, ggml_type wtype) {
         params["visual_projection"] = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, projection_dim, hidden_size);
     }
+
 public:
     // network hparams
     int32_t num_channels      = 3;
@@ -742,10 +769,10 @@ class CLIPVisionModel : public GGMLBlock {
         x      = post_layernorm->forward(ctx, x);  // [N, n_token, hidden_size]
 
         GGML_ASSERT(x->ne[2] == 1);
-        int64_t max_token_idx = 0;
-        ggml_tensor* pooled   = ggml_view_1d(ctx, x, x->ne[0], x->nb[1] * max_token_idx);  // assert N == 1
+        int64_t max_token_idx  = 0;
+        ggml_tensor* pooled    = ggml_view_1d(ctx, x, x->ne[0], x->nb[1] * max_token_idx);  // assert N == 1
         auto visual_projection = params["visual_projection"];
-        pooled               = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, visual_projection)), pooled);
+        pooled                 = ggml_mul_mat(ctx, ggml_cont(ctx, ggml_transpose(ctx, visual_projection)), pooled);
         return pooled;  // [N, projection_dim]
     }
 };
@@ -790,6 +817,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
     CLIPTextModel text_model;
     CLIPTextModel text_model2;
 
+    std::string embd_dir;
+    int32_t num_custom_embeddings = 0;
+    std::vector<uint8_t> token_embed_custom;
+    std::vector<std::string> readed_embeddings;
+
     FrozenCLIPEmbedderWithCustomWords(ggml_backend_t backend,
                                       ggml_type wtype,
                                       SDVersion version = VERSION_1_x,
@@ -849,15 +881,53 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
         }
     }
 
+    bool load_embedding(std::string embd_name, std::string embd_path, std::vector<int32_t>& bpe_tokens) {
+        // the order matters
+        ModelLoader model_loader;
+        if (!model_loader.init_from_file(embd_path)) {
+            LOG_ERROR("embedding '%s' failed", embd_name.c_str());
+            return false;
+        }
+        struct ggml_init_params params;
+        params.mem_size               = 32 * 1024;  // max for custom embeddings 32 KB
+        params.mem_buffer             = NULL;
+        params.no_alloc               = false;
+        struct ggml_context* embd_ctx = ggml_init(params);
+        struct ggml_tensor* embd      = NULL;
+        auto on_load                  = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
+            if (tensor_storage.ne[0] != text_model.hidden_size) {
+                LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], text_model.hidden_size);
+                return false;
+            }
+            embd        = ggml_new_tensor_2d(embd_ctx, wtype, text_model.hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
+            *dst_tensor = embd;
+            return true;
+        };
+        model_loader.load_tensors(on_load, NULL);
+        readed_embeddings.push_back(embd_name);
+        token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
+        memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * text_model.hidden_size * ggml_type_size(wtype)),
+               embd->data,
+               ggml_nbytes(embd));
+        for (int i = 0; i < embd->ne[1]; i++) {
+            bpe_tokens.push_back(text_model.vocab_size + num_custom_embeddings);
+            // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
+            num_custom_embeddings++;
+        }
+        LOG_DEBUG("embedding '%s' applied, custom embeddings: %i", embd_name.c_str(), num_custom_embeddings);
+        return true;
+    }
+
     struct ggml_tensor* forward(struct ggml_context* ctx,
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* input_ids2,
+                                struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
                                 bool return_pooled   = false) {
         if (return_pooled) {
-            return text_model2.forward(ctx, input_ids2, max_token_idx, return_pooled);
+            return text_model2.forward(ctx, input_ids2, NULL, max_token_idx, return_pooled);
         }
-        auto hidden_states = text_model.forward(ctx, input_ids);  // [N, n_token, hidden_size]
+        auto hidden_states = text_model.forward(ctx, input_ids, embeddings);  // [N, n_token, hidden_size]
         // LOG_DEBUG("hidden_states: %d %d %d %d", hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
         if (version == VERSION_XL) {
             hidden_states = ggml_reshape_4d(ctx,
@@ -868,7 +938,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
                                             hidden_states->ne[3]);
             hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 2, 0, 1, 3));
 
-            auto hidden_states2 = text_model2.forward(ctx, input_ids2);  // [N, n_token, hidden_size2]
+            auto hidden_states2 = text_model2.forward(ctx, input_ids2, NULL);  // [N, n_token, hidden_size2]
             // LOG_DEBUG("hidden_states: %d %d %d %d", hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
             hidden_states2 = ggml_reshape_4d(ctx,
                                              hidden_states2,
@@ -919,7 +989,34 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
             }
         }
 
-        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, input_ids2, max_token_idx, return_pooled);
+        struct ggml_tensor* embeddings = NULL;
+
+        if (num_custom_embeddings > 0 && version != VERSION_XL) {
+            embeddings = ggml_new_tensor_2d(compute_ctx,
+                                            wtype,
+                                            text_model.hidden_size,
+                                            text_model.vocab_size + num_custom_embeddings /* custom placeholder */);
+            ggml_allocr_alloc(allocr, embeddings);
+            if (!ggml_allocr_is_measure(allocr)) {
+                // really bad, there is memory inflexibility (this is for host<->device memory conflicts)
+                auto token_embed_weight = text_model.get_token_embed_weight();
+                void* freeze_data       = malloc(ggml_nbytes(token_embed_weight));
+                ggml_backend_tensor_get_and_sync(backend,
+                                                 token_embed_weight,
+                                                 freeze_data,
+                                                 0,
+                                                 ggml_nbytes(token_embed_weight));
+                ggml_backend_tensor_set(embeddings, freeze_data, 0, ggml_nbytes(token_embed_weight));
+                free(freeze_data);
+                // concatenate custom embeddings
+                ggml_backend_tensor_set(embeddings,
+                                        (const void*)token_embed_custom.data(),
+                                        ggml_nbytes(token_embed_weight),
+                                        num_custom_embeddings * text_model.hidden_size * ggml_type_size(wtype));
+            }
+        }
+
+        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, input_ids2, embeddings, max_token_idx, return_pooled);
 
         ggml_build_forward_expand(gf, hidden_states);
 
@@ -957,12 +1054,36 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
             LOG_DEBUG("parse '%s' to %s", text.c_str(), ss.str().c_str());
         }
 
+        auto on_new_token_cb = [&](std::string& str, std::vector<int32_t>& bpe_tokens) -> bool {
+            size_t word_end       = str.find(",");
+            std::string embd_name = word_end == std::string::npos ? str : str.substr(0, word_end);
+            embd_name             = trim(embd_name);
+            std::string embd_path = get_full_path(embd_dir, embd_name + ".pt");
+            if (embd_path.size() == 0) {
+                embd_path = get_full_path(embd_dir, embd_name + ".ckpt");
+            }
+            if (embd_path.size() == 0) {
+                embd_path = get_full_path(embd_dir, embd_name + ".safetensors");
+            }
+            if (embd_path.size() > 0) {
+                if (load_embedding(embd_name, embd_path, bpe_tokens)) {
+                    if (word_end != std::string::npos) {
+                        str = str.substr(word_end);
+                    } else {
+                        str = "";
+                    }
+                    return true;
+                }
+            }
+            return false;
+        };
+
         std::vector<int> tokens;
         std::vector<float> weights;
         for (const auto& item : parsed_attention) {
             const std::string& curr_text = item.first;
             float curr_weight            = item.second;
-            std::vector<int> curr_tokens = tokenizer.encode(curr_text);
+            std::vector<int> curr_tokens = tokenizer.encode(curr_text, on_new_token_cb);
             tokens.insert(tokens.end(), curr_tokens.begin(), curr_tokens.end());
             weights.insert(weights.end(), curr_tokens.size(), curr_weight);
         }