
Commit 02ce3dc

Merge branch 'leejet:master' into master
2 parents: c53acd1 + 10c6501

24 files changed: +1898 −298 lines

.github/workflows/build.yml

Lines changed: 7 additions & 7 deletions
@@ -155,15 +155,15 @@ jobs:
       matrix:
         include:
           - build: "noavx"
-            defines: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
           - build: "avx2"
-            defines: "-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
           - build: "avx"
-            defines: "-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX=ON -DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
           - build: "avx512"
-            defines: "-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
           - build: "cuda12"
-            defines: "-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;80;75"
           # - build: "rocm5.5"
           #   defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
           - build: 'vulkan'

@@ -178,9 +178,9 @@ jobs:
       - name: Install cuda-toolkit
         id: cuda-toolkit
         if: ${{ matrix.build == 'cuda12' }}
-        uses: Jimver/cuda-toolkit@v0.2.11
+        uses: Jimver/cuda-toolkit@v0.2.19
         with:
-          cuda: "12.2.0"
+          cuda: "12.6.2"
           method: "network"
           sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

CMakeLists.txt

Lines changed: 18 additions & 7 deletions
@@ -24,19 +24,20 @@ endif()
 # general
 #option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
 option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
-option(SD_CUBLAS "sd: cuda backend" OFF)
+option(SD_CUDA "sd: cuda backend" OFF)
 option(SD_HIPBLAS "sd: rocm backend" OFF)
 option(SD_METAL "sd: metal backend" OFF)
 option(SD_VULKAN "sd: vulkan backend" OFF)
 option(SD_SYCL "sd: sycl backend" OFF)
+option(SD_MUSA "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
 #option(SD_BUILD_SERVER "sd: build server example" ON)

-if(SD_CUBLAS)
-    message("-- Use CUBLAS as backend stable-diffusion")
+if(SD_CUDA)
+    message("-- Use CUDA as backend stable-diffusion")
     set(GGML_CUDA ON)
-    add_definitions(-DSD_USE_CUBLAS)
+    add_definitions(-DSD_USE_CUDA)
 endif()

 if(SD_METAL)

@@ -53,16 +54,25 @@ endif ()

 if (SD_HIPBLAS)
     message("-- Use HIPBLAS as backend stable-diffusion")
-    set(GGML_HIPBLAS ON)
-    add_definitions(-DSD_USE_CUBLAS)
+    set(GGML_HIP ON)
+    add_definitions(-DSD_USE_CUDA)
     if(SD_FAST_SOFTMAX)
         set(GGML_CUDA_FAST_SOFTMAX ON)
     endif()
 endif ()

+if(SD_MUSA)
+    message("-- Use MUSA as backend stable-diffusion")
+    set(GGML_MUSA ON)
+    add_definitions(-DSD_USE_CUDA)
+    if(SD_FAST_SOFTMAX)
+        set(GGML_CUDA_FAST_SOFTMAX ON)
+    endif()
+endif()
+
 set(SD_LIB stable-diffusion)

-file(GLOB SD_LIB_SOURCES
+file(GLOB SD_LIB_SOURCES
     "*.h"
     "*.cpp"
     "*.hpp"

@@ -86,6 +96,7 @@ endif()
 if(SD_SYCL)
     message("-- Use SYCL as backend stable-diffusion")
     set(GGML_SYCL ON)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
     add_definitions(-DSD_USE_SYCL)
     # disable fast-math on host, see:
     # https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
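Two build-facing changes here: the CUDA switch is renamed (`SD_CUBLAS` becomes `SD_CUDA`, and the HIP path now sets `GGML_HIP` instead of `GGML_HIPBLAS`), and a new `SD_MUSA` option wires Moore Threads GPUs through the same `SD_USE_CUDA` code path. A minimal sketch of the updated configure step, assuming an out-of-tree `build` directory:

```bash
# The old -DSD_CUBLAS=ON flag is gone; build scripts must migrate to:
cmake .. -DSD_CUDA=ON
# The new MUSA backend follows the same pattern:
cmake .. -DSD_MUSA=ON
```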

Dockerfile.musa

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+ARG MUSA_VERSION=rc3.1.1
+
+FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu22.04 as build
+
+RUN apt-get update && apt-get install -y cmake
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+RUN mkdir build && cd build && \
+    cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build . --config Release
+
+FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu22.04 as runtime
+
+COPY --from=build /sd.cpp/build/bin/sd /sd
+
+ENTRYPOINT [ "/sd" ]
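The new two-stage Dockerfile compiles `sd` in the MUSA devel image and ships only the binary in the slimmer runtime image. A hedged sketch of building and running it — the image tag and model path are illustrative, and GPU passthrough flags depend on the host's MUSA container runtime, so they are omitted here:

```bash
docker build -f Dockerfile.musa -t sd-cpp:musa .
# Arguments after the image name go straight to the /sd entrypoint.
docker run --rm -v "$PWD/models:/models" sd-cpp:musa \
    -m /models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
```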

README.md

Lines changed: 15 additions & 2 deletions
@@ -113,12 +113,12 @@ cmake .. -DGGML_OPENBLAS=ON
 cmake --build . --config Release
 ```

-##### Using CUBLAS
+##### Using CUDA

 This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.

 ```
-cmake .. -DSD_CUBLAS=ON
+cmake .. -DSD_CUDA=ON
 cmake --build . --config Release
 ```

@@ -132,6 +132,14 @@ cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_H
 cmake --build . --config Release
 ```

+##### Using MUSA
+
+This provides BLAS acceleration using the MUSA cores of your Moore Threads GPU. Make sure to have the MUSA toolkit installed.
+
+```bash
+cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
+cmake --build . --config Release
+```

 ##### Using Metal

@@ -232,6 +240,10 @@ arguments:
 -p, --prompt [PROMPT]              the prompt to render
 -n, --negative-prompt PROMPT       the negative prompt (default: "")
 --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)
+--skip-layers LAYERS               Layers to skip for SLG steps: (default: [7,8,9])
+--skip-layer-start START           SLG enabling point: (default: 0.01)
+--skip-layer-end END               SLG disabling point: (default: 0.2)
+                                   SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
 --strength STRENGTH                strength for noising/unnoising (default: 0.75)
 --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20%)
 --control-strength STRENGTH        strength to apply Control Net (default: 0.9)
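The three new `--skip-layer*` flags expose skip-layer guidance (SLG): the listed layers are skipped during the SLG window, which runs from step `int(STEPS*START)` to `int(STEPS*END)`. A usage sketch with the documented defaults — the model path and prompt are illustrative, and the bracketed list syntax is inferred from the default shown in the help text:

```bash
./sd -m models/sd3.5_medium.safetensors -p "a lovely cat" --steps 30 \
    --skip-layers "[7,8,9]" --skip-layer-start 0.01 --skip-layer-end 0.2
```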
@@ -314,6 +326,7 @@ These projects use `stable-diffusion.cpp` as a backend for their image generation capabilities

 - [Jellybox](https://jellybox.com)
 - [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
+- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)

 ## Contributors

clip.hpp

Lines changed: 1 addition & 1 deletion
@@ -546,7 +546,7 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t num_positions;

     void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type token_wtype = (tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
+        enum ggml_type token_wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
         enum ggml_type position_wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;

         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);

conditioner.hpp

Lines changed: 57 additions & 29 deletions
@@ -51,7 +51,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {

     std::string trigger_word = "img";  // should be user settable
     std::string embd_dir;
-    int32_t num_custom_embeddings = 0;
+    int32_t num_custom_embeddings   = 0;
+    int32_t num_custom_embeddings_2 = 0;
     std::vector<uint8_t> token_embed_custom;
     std::vector<std::string> readed_embeddings;

@@ -61,54 +62,54 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                        SDVersion version = VERSION_SD1,
                                        PMVersion pv = PM_VERSION_1,
                                        int clip_skip = -1)
-        : version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir) {
+        : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
         if (clip_skip <= 0) {
             clip_skip = 1;
-            if (version == VERSION_SD2 || version == VERSION_SDXL) {
+            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
                 clip_skip = 2;
             }
         }
-        if (version == VERSION_SD1) {
+        if (sd_version_is_sd1(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
-        } else if (version == VERSION_SD2) {
+        } else if (sd_version_is_sd2(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
-        } else if (version == VERSION_SDXL) {
+        } else if (sd_version_is_sdxl(version)) {
             text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
             text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         }
     }

     void set_clip_skip(int clip_skip) {
         text_model->set_clip_skip(clip_skip);
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->set_clip_skip(clip_skip);
         }
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
         }
     }

     void alloc_params_buffer() {
         text_model->alloc_params_buffer();
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->alloc_params_buffer();
         }
     }

     void free_params_buffer() {
         text_model->free_params_buffer();
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->free_params_buffer();
         }
     }

     size_t get_params_buffer_size() {
         size_t buffer_size = text_model->get_params_buffer_size();
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             buffer_size += text_model2->get_params_buffer_size();
         }
         return buffer_size;

@@ -131,28 +132,55 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         params.no_alloc               = false;
         struct ggml_context* embd_ctx = ggml_init(params);
         struct ggml_tensor* embd      = NULL;
-        int64_t hidden_size           = text_model->model.hidden_size;
+        struct ggml_tensor* embd2     = NULL;
         auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
-            if (tensor_storage.ne[0] != hidden_size) {
-                LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], hidden_size);
-                return false;
+            if (tensor_storage.ne[0] != text_model->model.hidden_size) {
+                if (text_model2) {
+                    if (tensor_storage.ne[0] == text_model2->model.hidden_size) {
+                        embd2       = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, text_model2->model.hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
+                        *dst_tensor = embd2;
+                    } else {
+                        LOG_DEBUG("embedding wrong hidden size, got %i, expected %i or %i", tensor_storage.ne[0], text_model->model.hidden_size, text_model2->model.hidden_size);
+                        return false;
+                    }
+                } else {
+                    LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], text_model->model.hidden_size);
+                    return false;
+                }
+            } else {
+                embd        = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, text_model->model.hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
+                *dst_tensor = embd;
             }
-            embd        = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
-            *dst_tensor = embd;
             return true;
         };
         model_loader.load_tensors(on_load, NULL);
         readed_embeddings.push_back(embd_name);
-        token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
-        memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(embd->type)),
-               embd->data,
-               ggml_nbytes(embd));
-        for (int i = 0; i < embd->ne[1]; i++) {
-            bpe_tokens.push_back(text_model->model.vocab_size + num_custom_embeddings);
-            // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
-            num_custom_embeddings++;
+        if (embd) {
+            int64_t hidden_size = text_model->model.hidden_size;
+            token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
+            memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(embd->type)),
+                   embd->data,
+                   ggml_nbytes(embd));
+            for (int i = 0; i < embd->ne[1]; i++) {
+                bpe_tokens.push_back(text_model->model.vocab_size + num_custom_embeddings);
+                // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
+                num_custom_embeddings++;
+            }
+            LOG_DEBUG("embedding '%s' applied, custom embeddings: %i", embd_name.c_str(), num_custom_embeddings);
+        }
+        if (embd2) {
+            int64_t hidden_size = text_model2->model.hidden_size;
+            token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd2));
+            memcpy((void*)(token_embed_custom.data() + num_custom_embeddings_2 * hidden_size * ggml_type_size(embd2->type)),
+                   embd2->data,
+                   ggml_nbytes(embd2));
+            for (int i = 0; i < embd2->ne[1]; i++) {
+                bpe_tokens.push_back(text_model2->model.vocab_size + num_custom_embeddings_2);
+                // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
+                num_custom_embeddings_2++;
+            }
+            LOG_DEBUG("embedding '%s' applied, custom embeddings: %i (text model 2)", embd_name.c_str(), num_custom_embeddings_2);
         }
-        LOG_DEBUG("embedding '%s' applied, custom embeddings: %i", embd_name.c_str(), num_custom_embeddings);
         return true;
     }

@@ -402,7 +430,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         auto input_ids                 = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
         struct ggml_tensor* input_ids2 = NULL;
         size_t max_token_idx           = 0;
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID);
             if (it != chunk_tokens.end()) {
                 std::fill(std::next(it), chunk_tokens.end(), 0);

@@ -427,7 +455,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                             false,
                             &chunk_hidden_states1,
                             work_ctx);
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->compute(n_threads,
                                  input_ids2,
                                  0,

@@ -486,7 +514,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                    ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);

         ggml_tensor* vec = NULL;
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             int out_dim = 256;
             vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels);
             // [0:1280]
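Two threads run through this file. First, exact comparisons such as `version == VERSION_SDXL` are replaced by `sd_version_is_*()` predicates so that related checkpoints (for example inpainting variants) can share one branch. A minimal sketch of the pattern, with an illustrative enum rather than the project's actual `SDVersion` definition:

```cpp
// Sketch only: the variant values here are assumptions for illustration;
// the real enum and predicates live in the project's model code.
enum SDVersion {
    VERSION_SD1, VERSION_SD1_INPAINT,
    VERSION_SD2, VERSION_SD2_INPAINT,
    VERSION_SDXL, VERSION_SDXL_INPAINT,
    VERSION_SVD,
};

static inline bool sd_version_is_sd1(SDVersion v) {
    return v == VERSION_SD1 || v == VERSION_SD1_INPAINT;
}
static inline bool sd_version_is_sd2(SDVersion v) {
    return v == VERSION_SD2 || v == VERSION_SD2_INPAINT;
}
static inline bool sd_version_is_sdxl(SDVersion v) {
    return v == VERSION_SDXL || v == VERSION_SDXL_INPAINT;
}
```

Second, SDXL textual-inversion embeddings can now carry weights for both text encoders: tensors whose width matches `text_model2->model.hidden_size` are routed into a separate `embd2` buffer and counted via `num_custom_embeddings_2`, instead of being rejected for having the wrong hidden size.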

control.hpp

Lines changed: 3 additions & 3 deletions
@@ -34,11 +34,11 @@ class ControlNetBlock : public GGMLBlock {

     ControlNetBlock(SDVersion version = VERSION_SD1)
         : version(version) {
-        if (version == VERSION_SD2) {
+        if (sd_version_is_sd2(version)) {
             context_dim       = 1024;
             num_head_channels = 64;
             num_heads         = -1;
-        } else if (version == VERSION_SDXL) {
+        } else if (sd_version_is_sdxl(version)) {
             context_dim           = 2048;
             attention_resolutions = {4, 2};
             channel_mult          = {1, 2, 4};

@@ -58,7 +58,7 @@ class ControlNetBlock : public GGMLBlock {
         // time_embed_1 is nn.SiLU()
         blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));

-        if (version == VERSION_SDXL || version == VERSION_SVD) {
+        if (sd_version_is_sdxl(version) || version == VERSION_SVD) {
             blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
             // label_emb_1 is nn.SiLU()
             blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
