
Commit 02ce3dc

Merge branch 'leejet:master' into master
2 parents: c53acd1 + 10c6501

24 files changed: +1898 −298 lines

.github/workflows/build.yml

Lines changed: 7 additions & 7 deletions
@@ -155,15 +155,15 @@ jobs:
       matrix:
         include:
           - build: "noavx"
-            defines: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
           - build: "avx2"
-            defines: "-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
           - build: "avx"
-            defines: "-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX=ON -DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
           - build: "avx512"
-            defines: "-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DGGML_NATIVE=OFF -DGGML_AVX512=ON -DGGML_AVX=ON -DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
           - build: "cuda12"
-            defines: "-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON"
+            defines: "-DSD_CUDA=ON -DSD_BUILD_SHARED_LIBS=ON -DCMAKE_CUDA_ARCHITECTURES=90;89;80;75"
           # - build: "rocm5.5"
           #   defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
           - build: 'vulkan'

@@ -178,9 +178,9 @@ jobs:
       - name: Install cuda-toolkit
         id: cuda-toolkit
         if: ${{ matrix.build == 'cuda12' }}
-        uses: Jimver/cuda-toolkit@v0.2.11
+        uses: Jimver/cuda-toolkit@v0.2.19
         with:
-          cuda: "12.2.0"
+          cuda: "12.6.2"
           method: "network"
           sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

CMakeLists.txt

Lines changed: 18 additions & 7 deletions
@@ -24,19 +24,20 @@ endif()
 # general
 #option(SD_BUILD_TESTS "sd: build tests" ${SD_STANDALONE})
 option(SD_BUILD_EXAMPLES "sd: build examples" ${SD_STANDALONE})
-option(SD_CUBLAS "sd: cuda backend" OFF)
+option(SD_CUDA "sd: cuda backend" OFF)
 option(SD_HIPBLAS "sd: rocm backend" OFF)
 option(SD_METAL "sd: metal backend" OFF)
 option(SD_VULKAN "sd: vulkan backend" OFF)
 option(SD_SYCL "sd: sycl backend" OFF)
+option(SD_MUSA "sd: musa backend" OFF)
 option(SD_FAST_SOFTMAX "sd: x1.5 faster softmax, indeterministic (sometimes, same seed don't generate same image), cuda only" OFF)
 option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
 #option(SD_BUILD_SERVER "sd: build server example" ON)

-if(SD_CUBLAS)
-    message("-- Use CUBLAS as backend stable-diffusion")
+if(SD_CUDA)
+    message("-- Use CUDA as backend stable-diffusion")
     set(GGML_CUDA ON)
-    add_definitions(-DSD_USE_CUBLAS)
+    add_definitions(-DSD_USE_CUDA)
 endif()

 if(SD_METAL)

@@ -53,16 +54,25 @@ endif ()

 if (SD_HIPBLAS)
     message("-- Use HIPBLAS as backend stable-diffusion")
-    set(GGML_HIPBLAS ON)
-    add_definitions(-DSD_USE_CUBLAS)
+    set(GGML_HIP ON)
+    add_definitions(-DSD_USE_CUDA)
     if(SD_FAST_SOFTMAX)
         set(GGML_CUDA_FAST_SOFTMAX ON)
     endif()
 endif ()

+if(SD_MUSA)
+    message("-- Use MUSA as backend stable-diffusion")
+    set(GGML_MUSA ON)
+    add_definitions(-DSD_USE_CUDA)
+    if(SD_FAST_SOFTMAX)
+        set(GGML_CUDA_FAST_SOFTMAX ON)
+    endif()
+endif()
+
 set(SD_LIB stable-diffusion)

-file(GLOB SD_LIB_SOURCES
+file(GLOB SD_LIB_SOURCES
     "*.h"
     "*.cpp"
     "*.hpp"

@@ -86,6 +96,7 @@ endif()
 if(SD_SYCL)
     message("-- Use SYCL as backend stable-diffusion")
     set(GGML_SYCL ON)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
     add_definitions(-DSD_USE_SYCL)
     # disable fast-math on host, see:
     # https://www.intel.com/content/www/us/en/docs/cpp-compiler/developer-guide-reference/2021-10/fp-model-fp.html
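Two build-facing changes here: the CUDA switch is renamed (`SD_CUBLAS` becomes `SD_CUDA`, and the HIP path now sets `GGML_HIP` instead of `GGML_HIPBLAS`), and a new `SD_MUSA` option wires Moore Threads GPUs through the same `SD_USE_CUDA` code path. A minimal sketch of the updated configure step, assuming an out-of-tree `build` directory:

```bash
# The old -DSD_CUBLAS=ON flag is gone; build scripts must migrate to:
cmake .. -DSD_CUDA=ON
# The new MUSA backend follows the same pattern:
cmake .. -DSD_MUSA=ON
```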

Dockerfile.musa

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+ARG MUSA_VERSION=rc3.1.1
+
+FROM mthreads/musa:${MUSA_VERSION}-devel-ubuntu22.04 as build
+
+RUN apt-get update && apt-get install -y cmake
+
+WORKDIR /sd.cpp
+
+COPY . .
+
+RUN mkdir build && cd build && \
+    cmake .. -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release && \
+    cmake --build . --config Release
+
+FROM mthreads/musa:${MUSA_VERSION}-runtime-ubuntu22.04 as runtime
+
+COPY --from=build /sd.cpp/build/bin/sd /sd
+
+ENTRYPOINT [ "/sd" ]
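The new two-stage Dockerfile compiles `sd` in the MUSA devel image and ships only the binary in the slimmer runtime image. A hedged sketch of building and running it — the image tag and model path are illustrative, and GPU passthrough flags depend on the host's MUSA container runtime, so they are omitted here:

```bash
docker build -f Dockerfile.musa -t sd-cpp:musa .
# Arguments after the image name go straight to the /sd entrypoint.
docker run --rm -v "$PWD/models:/models" sd-cpp:musa \
    -m /models/v1-5-pruned-emaonly.safetensors -p "a lovely cat"
```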

README.md

Lines changed: 15 additions & 2 deletions
@@ -113,12 +113,12 @@ cmake .. -DGGML_OPENBLAS=ON
 cmake --build . --config Release
 ```

-##### Using CUBLAS
+##### Using CUDA

 This provides BLAS acceleration using the CUDA cores of your Nvidia GPU. Make sure to have the CUDA toolkit installed. You can download it from your Linux distro's package manager (e.g. `apt install nvidia-cuda-toolkit`) or from here: [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads). Recommended to have at least 4 GB of VRAM.

 ```
-cmake .. -DSD_CUBLAS=ON
+cmake .. -DSD_CUDA=ON
 cmake --build . --config Release
 ```

@@ -132,6 +132,14 @@ cmake .. -G "Ninja" -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_H
 cmake --build . --config Release
 ```

+##### Using MUSA
+
+This provides BLAS acceleration using the MUSA cores of your Moore Threads GPU. Make sure to have the MUSA toolkit installed.
+
+```bash
+cmake .. -DCMAKE_C_COMPILER=/usr/local/musa/bin/clang -DCMAKE_CXX_COMPILER=/usr/local/musa/bin/clang++ -DSD_MUSA=ON -DCMAKE_BUILD_TYPE=Release
+cmake --build . --config Release
+```

 ##### Using Metal

@@ -232,6 +240,10 @@ arguments:
 -p, --prompt [PROMPT]              the prompt to render
 -n, --negative-prompt PROMPT       the negative prompt (default: "")
 --cfg-scale SCALE                  unconditional guidance scale: (default: 7.0)
+--skip-layers LAYERS               Layers to skip for SLG steps: (default: [7,8,9])
+--skip-layer-start START           SLG enabling point: (default: 0.01)
+--skip-layer-end END               SLG disabling point: (default: 0.2)
+                                   SLG will be enabled at step int([STEPS]*[START]) and disabled at int([STEPS]*[END])
 --strength STRENGTH                strength for noising/unnoising (default: 0.75)
 --style-ratio STYLE-RATIO          strength for keeping input identity (default: 20%)
 --control-strength STRENGTH        strength to apply Control Net (default: 0.9)
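The three new `--skip-layer*` flags expose skip-layer guidance (SLG): the listed layers are skipped during the SLG window, which runs from step `int(STEPS*START)` to `int(STEPS*END)`. A usage sketch with the documented defaults — the model path and prompt are illustrative, and the bracketed list syntax is inferred from the default shown in the help text:

```bash
./sd -m models/sd3.5_medium.safetensors -p "a lovely cat" --steps 30 \
    --skip-layers "[7,8,9]" --skip-layer-start 0.01 --skip-layer-end 0.2
```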
@@ -314,6 +326,7 @@ These projects use `stable-diffusion.cpp` as a backend for their image generation capabilities

 - [Jellybox](https://jellybox.com)
 - [Stable Diffusion GUI](https://github.com/fszontagh/sd.cpp.gui.wx)
+- [Stable Diffusion CLI-GUI](https://github.com/piallai/stable-diffusion.cpp)

 ## Contributors

clip.hpp

Lines changed: 1 addition & 1 deletion
@@ -546,7 +546,7 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t num_positions;

     void init_params(struct ggml_context* ctx, std::map<std::string, enum ggml_type>& tensor_types, const std::string prefix = "") {
-        enum ggml_type token_wtype = (tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
+        enum ggml_type token_wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "token_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "token_embedding.weight"] : GGML_TYPE_F32;
         enum ggml_type position_wtype = GGML_TYPE_F32;  //(tensor_types.find(prefix + "position_embedding.weight") != tensor_types.end()) ? tensor_types[prefix + "position_embedding.weight"] : GGML_TYPE_F32;

         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);

conditioner.hpp

Lines changed: 57 additions & 29 deletions
@@ -51,7 +51,8 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {

     std::string trigger_word = "img";  // should be user settable
     std::string embd_dir;
-    int32_t num_custom_embeddings = 0;
+    int32_t num_custom_embeddings   = 0;
+    int32_t num_custom_embeddings_2 = 0;
     std::vector<uint8_t> token_embed_custom;
     std::vector<std::string> readed_embeddings;

@@ -61,54 +62,54 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                        SDVersion version = VERSION_SD1,
                                        PMVersion pv = PM_VERSION_1,
                                        int clip_skip = -1)
-        : version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir) {
+        : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
         if (clip_skip <= 0) {
             clip_skip = 1;
-            if (version == VERSION_SD2 || version == VERSION_SDXL) {
+            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
                 clip_skip = 2;
             }
         }
-        if (version == VERSION_SD1) {
+        if (sd_version_is_sd1(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
-        } else if (version == VERSION_SD2) {
+        } else if (sd_version_is_sd2(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
-        } else if (version == VERSION_SDXL) {
+        } else if (sd_version_is_sdxl(version)) {
             text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
             text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         }
     }

     void set_clip_skip(int clip_skip) {
         text_model->set_clip_skip(clip_skip);
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->set_clip_skip(clip_skip);
         }
     }

     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
         }
     }

     void alloc_params_buffer() {
         text_model->alloc_params_buffer();
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->alloc_params_buffer();
         }
     }

     void free_params_buffer() {
         text_model->free_params_buffer();
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->free_params_buffer();
         }
     }

     size_t get_params_buffer_size() {
         size_t buffer_size = text_model->get_params_buffer_size();
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             buffer_size += text_model2->get_params_buffer_size();
         }
         return buffer_size;

@@ -131,28 +132,55 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         params.no_alloc               = false;
         struct ggml_context* embd_ctx = ggml_init(params);
         struct ggml_tensor* embd      = NULL;
-        int64_t hidden_size           = text_model->model.hidden_size;
+        struct ggml_tensor* embd2     = NULL;
         auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
-            if (tensor_storage.ne[0] != hidden_size) {
-                LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], hidden_size);
-                return false;
+            if (tensor_storage.ne[0] != text_model->model.hidden_size) {
+                if (text_model2) {
+                    if (tensor_storage.ne[0] == text_model2->model.hidden_size) {
+                        embd2       = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, text_model2->model.hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
+                        *dst_tensor = embd2;
+                    } else {
+                        LOG_DEBUG("embedding wrong hidden size, got %i, expected %i or %i", tensor_storage.ne[0], text_model->model.hidden_size, text_model2->model.hidden_size);
+                        return false;
+                    }
+                } else {
+                    LOG_DEBUG("embedding wrong hidden size, got %i, expected %i", tensor_storage.ne[0], text_model->model.hidden_size);
+                    return false;
+                }
+            } else {
+                embd        = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, text_model->model.hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
+                *dst_tensor = embd;
             }
-            embd        = ggml_new_tensor_2d(embd_ctx, tensor_storage.type, hidden_size, tensor_storage.n_dims > 1 ? tensor_storage.ne[1] : 1);
-            *dst_tensor = embd;
             return true;
         };
         model_loader.load_tensors(on_load, NULL);
         readed_embeddings.push_back(embd_name);
-        token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
-        memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(embd->type)),
-               embd->data,
-               ggml_nbytes(embd));
-        for (int i = 0; i < embd->ne[1]; i++) {
-            bpe_tokens.push_back(text_model->model.vocab_size + num_custom_embeddings);
-            // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
-            num_custom_embeddings++;
+        if (embd) {
+            int64_t hidden_size = text_model->model.hidden_size;
+            token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd));
+            memcpy((void*)(token_embed_custom.data() + num_custom_embeddings * hidden_size * ggml_type_size(embd->type)),
+                   embd->data,
+                   ggml_nbytes(embd));
+            for (int i = 0; i < embd->ne[1]; i++) {
+                bpe_tokens.push_back(text_model->model.vocab_size + num_custom_embeddings);
+                // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
+                num_custom_embeddings++;
+            }
+            LOG_DEBUG("embedding '%s' applied, custom embeddings: %i", embd_name.c_str(), num_custom_embeddings);
+        }
+        if (embd2) {
+            int64_t hidden_size = text_model2->model.hidden_size;
+            token_embed_custom.resize(token_embed_custom.size() + ggml_nbytes(embd2));
+            memcpy((void*)(token_embed_custom.data() + num_custom_embeddings_2 * hidden_size * ggml_type_size(embd2->type)),
+                   embd2->data,
+                   ggml_nbytes(embd2));
+            for (int i = 0; i < embd2->ne[1]; i++) {
+                bpe_tokens.push_back(text_model2->model.vocab_size + num_custom_embeddings_2);
+                // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
+                num_custom_embeddings_2++;
+            }
+            LOG_DEBUG("embedding '%s' applied, custom embeddings: %i (text model 2)", embd_name.c_str(), num_custom_embeddings_2);
         }
-        LOG_DEBUG("embedding '%s' applied, custom embeddings: %i", embd_name.c_str(), num_custom_embeddings);
         return true;
     }

@@ -402,7 +430,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         auto input_ids                 = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
         struct ggml_tensor* input_ids2 = NULL;
         size_t max_token_idx           = 0;
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID);
             if (it != chunk_tokens.end()) {
                 std::fill(std::next(it), chunk_tokens.end(), 0);

@@ -427,7 +455,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                             false,
                             &chunk_hidden_states1,
                             work_ctx);
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->compute(n_threads,
                                  input_ids2,
                                  0,

@@ -486,7 +514,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                    ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);

         ggml_tensor* vec = NULL;
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             int out_dim = 256;
             vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels);
             // [0:1280]
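Two threads run through this file. First, exact comparisons such as `version == VERSION_SDXL` are replaced by `sd_version_is_*()` predicates so that related checkpoints (for example inpainting variants) can share one branch. A minimal sketch of the pattern, with an illustrative enum rather than the project's actual `SDVersion` definition:

```cpp
// Sketch only: the variant values here are assumptions for illustration;
// the real enum and predicates live in the project's model code.
enum SDVersion {
    VERSION_SD1, VERSION_SD1_INPAINT,
    VERSION_SD2, VERSION_SD2_INPAINT,
    VERSION_SDXL, VERSION_SDXL_INPAINT,
    VERSION_SVD,
};

static inline bool sd_version_is_sd1(SDVersion v) {
    return v == VERSION_SD1 || v == VERSION_SD1_INPAINT;
}
static inline bool sd_version_is_sd2(SDVersion v) {
    return v == VERSION_SD2 || v == VERSION_SD2_INPAINT;
}
static inline bool sd_version_is_sdxl(SDVersion v) {
    return v == VERSION_SDXL || v == VERSION_SDXL_INPAINT;
}
```

Second, SDXL textual-inversion embeddings can now carry weights for both text encoders: tensors whose width matches `text_model2->model.hidden_size` are routed into a separate `embd2` buffer and counted via `num_custom_embeddings_2`, instead of being rejected for having the wrong hidden size.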

control.hpp

Lines changed: 3 additions & 3 deletions
@@ -34,11 +34,11 @@ class ControlNetBlock : public GGMLBlock {

     ControlNetBlock(SDVersion version = VERSION_SD1)
         : version(version) {
-        if (version == VERSION_SD2) {
+        if (sd_version_is_sd2(version)) {
             context_dim       = 1024;
             num_head_channels = 64;
             num_heads         = -1;
-        } else if (version == VERSION_SDXL) {
+        } else if (sd_version_is_sdxl(version)) {
             context_dim           = 2048;
             attention_resolutions = {4, 2};
             channel_mult          = {1, 2, 4};

@@ -58,7 +58,7 @@ class ControlNetBlock : public GGMLBlock {
         // time_embed_1 is nn.SiLU()
         blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));

-        if (version == VERSION_SDXL || version == VERSION_SVD) {
+        if (sd_version_is_sdxl(version) || version == VERSION_SVD) {
             blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
             // label_emb_1 is nn.SiLU()
             blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
