Commit 366665b

feat(tx): support multiple devices

Signed-off-by: thxCode <thxcode0824@gmail.com>

1 parent 701573d

3 files changed, 259 insertions(+), 117 deletions(-)

stable-diffusion.cpp

Lines changed: 169 additions & 72 deletions
@@ -50,8 +50,7 @@ const char* sampling_methods_str[] = {
     "iPNDM",
     "iPNDM_v",
    "LCM",
-    "DDIM \"trailing\""
-};
+    "DDIM \"trailing\""};
 
 /*================================================== Helper Functions ================================================*/
 
@@ -143,8 +142,8 @@ class StableDiffusionGGML {
 public:
     ggml_backend_t backend = NULL;  // general backend
     ggml_backend_t clip_backend = NULL;
-    ggml_backend_t control_net_backend = NULL;
     ggml_backend_t vae_backend = NULL;
+    ggml_backend_t control_net_backend = NULL;
     ggml_type model_wtype = GGML_TYPE_COUNT;
     ggml_type clip_l_wtype = GGML_TYPE_COUNT;
     ggml_type clip_g_wtype = GGML_TYPE_COUNT;
@@ -232,55 +231,157 @@ class StableDiffusionGGML {
               bool vae_on_cpu,
               bool diffusion_flash_attn,
               bool tae_preview_only,
-              int main_gpu) {
+              const std::vector<std::string>& rpc_servers,
+              const float* tensor_split) {
         use_tiny_autoencoder = taesd_path.size() > 0;
 
         ggml_log_set(ggml_log_callback_default, nullptr);
-#ifdef SD_USE_CUDA
-#ifdef SD_USE_HIP
-        LOG_DEBUG("Using HIP backend");
-#elif defined(SD_USE_MUSA)
-        LOG_DEBUG("Using MUSA backend");
-#else
-        LOG_DEBUG("Using CUDA backend");
-#endif
-        backend = ggml_backend_cuda_init(main_gpu);
-        if (!backend) {
-            LOG_ERROR("CUDA backend init failed");
-        }
-#endif
-#ifdef SD_USE_METAL
-        LOG_DEBUG("Using Metal backend");
-        backend = ggml_backend_metal_init();
-        if (!backend) {
-            LOG_ERROR("Metal backend init failed");
-        }
-#endif
-#ifdef SD_USE_VULKAN
-        LOG_DEBUG("Using Vulkan backend");
-        backend = ggml_backend_vk_init(main_gpu);
-        if (!backend) {
-            LOG_ERROR("Vulkan backend init failed");
-        }
-#endif
-#ifdef SD_USE_SYCL
-        LOG_DEBUG("Using SYCL backend");
-        backend = ggml_backend_sycl_init(main_gpu);
-        if (!backend) {
-            LOG_ERROR("SYCL backend init failed");
-        }
-#endif
-#ifdef SD_USE_CANN
-        LOG_DEBUG("Using CANN backend");
-        backend = ggml_backend_cann_init(main_gpu);
-        if (!backend) {
-            LOG_ERROR("CANN backend init failed");
-        }
-#endif
-
-        if (!backend) {
-            LOG_DEBUG("Using CPU backend");
+
+        std::vector<ggml_backend_dev_t> devices;
+
+        if (!rpc_servers.empty()) {
+            ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+            if (!rpc_reg) {
+                LOG_ERROR("failed to find RPC backend");
+                return false;
+            }
+
+            typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char* endpoint);
+            ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t)ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+            if (!ggml_backend_rpc_add_device_fn) {
+                LOG_ERROR("failed to find RPC device add function");
+                return false;
+            }
+
+            for (const std::string& server : rpc_servers) {
+                ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+                if (dev) {
+                    devices.push_back(dev);
+                } else {
+                    LOG_ERROR("failed to add RPC device for server '%s'", server.c_str());
+                    return false;
+                }
+            }
+        }
+
+        // use all available devices
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            switch (ggml_backend_dev_type(dev)) {
+                case GGML_BACKEND_DEVICE_TYPE_CPU:
+                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+                    // skip CPU backends since they are handled separately
+                    break;
+
+                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                    devices.push_back(dev);
+                    break;
+            }
+        }
+
+        for (auto* dev : devices) {
+            size_t free, total;  // NOLINT
+            ggml_backend_dev_memory(dev, &free, &total);
+            LOG_INFO("using device %s (%s) - %zu MiB free", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free / 1024 / 1024);
+        }
+
+        // build GPU devices buffer list
+        std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>> gpu_devices;
+        {
+            const bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + devices.size(), [](float x) { return x == 0.0f; });
+            // add GPU buffer types
+            for (size_t i = 0; i < devices.size(); ++i) {
+                if (!all_zero && tensor_split[i] <= 0.0f) {
+                    continue;
+                }
+                ggml_backend_device* dev = devices[i];
+                gpu_devices.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
+            }
+        }
+
+        // initialize the backend
+        if (gpu_devices.empty()) {
+            // no GPU devices available
             backend = ggml_backend_cpu_init();
+        } else if (gpu_devices.size() < 3) {
+            // use the last GPU device
+            backend = ggml_backend_dev_init(gpu_devices[gpu_devices.size() - 1].first, nullptr);
+        } else {
+            // use the 3rd GPU device
+            backend = ggml_backend_dev_init(gpu_devices[2].first, nullptr);
+        }
+        switch (gpu_devices.size()) {
+            case 0: {
+                clip_backend = backend;
+                vae_backend = backend;
+                control_net_backend = backend;
+                break;
+            }
+            case 1: {
+                // device 0: clip, vae, control_net
+                clip_backend = backend;
+                if (clip_on_cpu) {
+                    LOG_INFO("CLIP: Using CPU backend");
+                    clip_backend = ggml_backend_cpu_init();
+                }
+                vae_backend = backend;
+                if (vae_on_cpu) {
+                    LOG_INFO("VAE Autoencoder: Using CPU backend");
+                    vae_backend = ggml_backend_cpu_init();
+                }
+                control_net_backend = backend;
+                if (control_net_cpu) {
+                    LOG_INFO("ControlNet: Using CPU backend");
+                    control_net_backend = ggml_backend_cpu_init();
+                }
+                break;
+            }
+            case 2: {
+                // device 0: clip, vae
+                // device 1: control_net
+                if (clip_on_cpu) {
+                    LOG_INFO("CLIP: Using CPU backend");
+                    clip_backend = ggml_backend_cpu_init();
+                } else {
+                    clip_backend = ggml_backend_dev_init(gpu_devices[0].first, nullptr);
+                }
+                if (vae_on_cpu) {
+                    LOG_INFO("VAE Autoencoder: Using CPU backend");
+                    vae_backend = ggml_backend_cpu_init();
+                } else {
+                    vae_backend = ggml_backend_dev_init(gpu_devices[0].first, nullptr);
+                }
+                if (control_net_cpu) {
+                    LOG_INFO("ControlNet: Using CPU backend");
+                    control_net_backend = ggml_backend_cpu_init();
+                } else {
+                    control_net_backend = ggml_backend_dev_init(gpu_devices[1].first, nullptr);
+                }
+                break;
+            }
+            default: {
+                // device 0: clip
+                // device 1: vae
+                // device 2: control_net
+                if (clip_on_cpu) {
+                    LOG_INFO("CLIP: Using CPU backend");
+                    clip_backend = ggml_backend_cpu_init();
+                } else {
+                    clip_backend = ggml_backend_dev_init(gpu_devices[0].first, nullptr);
+                }
+                if (vae_on_cpu) {
+                    LOG_INFO("VAE Autoencoder: Using CPU backend");
+                    vae_backend = ggml_backend_cpu_init();
+                } else {
+                    vae_backend = ggml_backend_dev_init(gpu_devices[1].first, nullptr);
+                }
+                if (control_net_cpu) {
+                    LOG_INFO("ControlNet: Using CPU backend");
+                    control_net_backend = ggml_backend_cpu_init();
+                } else {
+                    control_net_backend = ggml_backend_dev_init(gpu_devices[2].first, nullptr);
+                }
+            }
         }
 
         ModelLoader model_loader;
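
The hunk above replaces the single `main_gpu` index with a device list: RPC endpoints are resolved through the RPC backend registry's `ggml_backend_rpc_add_device` proc address, every local GPU device is appended, and `tensor_split` filters which entries become usable `gpu_devices` (a null or all-zero split keeps them all; otherwise only entries with a positive value survive). The switch then assigns components by device count: one GPU hosts everything; with two, CLIP and the VAE sit on device 0 and ControlNet on device 1; with three or more, CLIP, VAE, and ControlNet take devices 0, 1, and 2, while the general `backend` that runs the diffusion model uses the last GPU when fewer than three are usable, otherwise the third. A minimal standalone sketch of the selection rule, not part of the commit, with a hypothetical split:

// select_devices.cpp - standalone sketch (not from the commit) of the
// tensor_split filtering performed above.
#include <algorithm>
#include <cstdio>
#include <vector>

static std::vector<size_t> select_devices(const float* tensor_split, size_t n_devices) {
    std::vector<size_t> kept;
    const bool all_zero = tensor_split == nullptr ||
                          std::all_of(tensor_split, tensor_split + n_devices,
                                      [](float x) { return x == 0.0f; });
    for (size_t i = 0; i < n_devices; ++i) {
        if (all_zero || tensor_split[i] > 0.0f) {
            kept.push_back(i);  // device i would get a buffer type in gpu_devices
        }
    }
    return kept;
}

int main() {
    const float split[3] = {1.0f, 0.0f, 1.0f};  // hypothetical: exclude device 1
    for (size_t i : select_devices(split, 3)) {
        printf("keeping device %zu\n", i);
    }
    // select_devices(nullptr, 3) would keep all three devices
    return 0;
}
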
@@ -441,24 +542,19 @@ class StableDiffusionGGML {
         auto cc_vae = model_loader.has_prefix_tensors("first_stage_model.") && !model_loader.has_prefix_tensors("vae.");
 
         if (version == VERSION_SVD) {
-            clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_loader.tensor_storages_types, cc_clip_l);
+            clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(clip_backend, model_loader.tensor_storages_types, cc_clip_l);
             clip_vision->alloc_params_buffer();
             clip_vision->get_param_tensors(tensors);
 
             diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version);
             diffusion_model->alloc_params_buffer();
             diffusion_model->get_param_tensors(tensors);
 
-            first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_loader.tensor_storages_types, vae_decode_only, true, version, cc_vae);
+            first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, vae_decode_only, true, version, cc_vae);
             LOG_DEBUG("vae_decode_only %d", vae_decode_only);
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors);
         } else {
-            clip_backend = backend;
-            if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
-                LOG_INFO("CLIP: Using CPU backend");
-                clip_backend = ggml_backend_cpu_init();
-            }
             if (diffusion_flash_attn) {
                 LOG_INFO("Using flash attention in the diffusion model");
             }
@@ -487,30 +583,17 @@ class StableDiffusionGGML {
             diffusion_model->get_param_tensors(tensors);
 
             if (!use_tiny_autoencoder || tae_preview_only) {
-                if (vae_on_cpu && !ggml_backend_is_cpu(backend)) {
-                    LOG_INFO("VAE Autoencoder: Using CPU backend");
-                    vae_backend = ggml_backend_cpu_init();
-                } else {
-                    vae_backend = backend;
-                }
                 first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, vae_decode_only, false, version, cc_vae);
                 first_stage_model->alloc_params_buffer();
                 first_stage_model->get_param_tensors(tensors);
             }
             if (use_tiny_autoencoder) {
-                tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_loader.tensor_storages_types, vae_decode_only, version, cc_vae);
+                tae_first_stage = std::make_shared<TinyAutoEncoder>(vae_backend, model_loader.tensor_storages_types, vae_decode_only, version, cc_vae);
             }
             // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
 
             if (control_net_path.size() > 0) {
-                ggml_backend_t controlnet_backend = NULL;
-                if (control_net_cpu && !ggml_backend_is_cpu(backend)) {
-                    LOG_DEBUG("ControlNet: Using CPU backend");
-                    controlnet_backend = ggml_backend_cpu_init();
-                } else {
-                    controlnet_backend = backend;
-                }
-                control_net = std::make_shared<ControlNet>(controlnet_backend, model_loader.tensor_storages_types, version);
+                control_net = std::make_shared<ControlNet>(control_net_backend, model_loader.tensor_storages_types, version);
             }
 
             if (id_embeddings_path.find("v2") != std::string::npos) {
@@ -1418,7 +1501,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                      bool keep_vae_on_cpu,
                      bool diffusion_flash_attn,
                      bool tae_preview_only,
-                     int main_gpu) {
+                     const char* rpc_servers,
+                     const float* tensor_splits) {
     sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
     if (sd_ctx == NULL) {
         return NULL;
@@ -1434,6 +1518,18 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
     std::string embd_path(embed_dir_c_str);
     std::string id_embd_path(id_embed_dir_c_str);
     std::string lora_model_dir(lora_model_dir_c_str);
+    std::vector<std::string> rpc_servers_vec;
+    if (rpc_servers != nullptr && rpc_servers[0] != '\0') {
+        // split the servers set them into model->rpc_servers
+        std::string servers(rpc_servers);
+        size_t pos = 0;
+        while ((pos = servers.find(',')) != std::string::npos) {
+            std::string server = servers.substr(0, pos);
+            rpc_servers_vec.push_back(server);
+            servers.erase(0, pos + 1);
+        }
+        rpc_servers_vec.push_back(servers);
+    }
 
     sd_ctx->sd = new StableDiffusionGGML(n_threads,
                                          vae_decode_only,
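
The new block above splits the `rpc_servers` C string on commas with `std::string::find`/`erase`. A self-contained sketch (not part of the commit, endpoints hypothetical) of what a typical value decomposes into; note that the final `push_back` keeps the remainder after the last comma, so a trailing comma would yield an empty endpoint that device registration would presumably reject later:

// split_servers.cpp - standalone sketch (not from the commit) of the
// comma splitting performed in new_sd_ctx above.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    // hypothetical endpoints in ggml RPC's host:port form
    std::string servers = "192.168.1.10:50052,192.168.1.11:50052";
    std::vector<std::string> rpc_servers_vec;
    size_t pos = 0;
    while ((pos = servers.find(',')) != std::string::npos) {
        rpc_servers_vec.push_back(servers.substr(0, pos));
        servers.erase(0, pos + 1);
    }
    rpc_servers_vec.push_back(servers);  // remainder after the last comma
    for (const std::string& s : rpc_servers_vec) {
        printf("endpoint: %s\n", s.c_str());  // prints the two endpoints
    }
    return 0;
}
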
@@ -1463,7 +1559,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                                          keep_vae_on_cpu,
                                          diffusion_flash_attn,
                                          tae_preview_only,
-                                         main_gpu)) {
+                                         rpc_servers_vec,
+                                         tensor_splits)) {
        delete sd_ctx->sd;
        sd_ctx->sd = NULL;
        free(sd_ctx);

stable-diffusion.h

Lines changed: 4 additions & 2 deletions
@@ -167,7 +167,8 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path,
                             bool keep_vae_on_cpu,
                             bool diffusion_flash_attn,
                             bool tae_preview_only,
-                            int main_gpu = 0);
+                            const char * rpc_servers = nullptr,
+                            const float * tensor_splits = nullptr);
 
 SD_API void free_sd_ctx(sd_ctx_t* sd_ctx);
 
@@ -308,7 +309,8 @@ typedef struct upscaler_ctx_t upscaler_ctx_t;
 
 SD_API upscaler_ctx_t* new_upscaler_ctx(const char* esrgan_path,
                                         int n_threads,
-                                        int main_gpu = 0);
+                                        const char * rpc_servers = nullptr,
+                                        const float * tensor_splits = nullptr);
 SD_API void free_upscaler_ctx(upscaler_ctx_t* upscaler_ctx);
 
 SD_API sd_image_t upscale(upscaler_ctx_t* upscaler_ctx, sd_image_t input_image, uint32_t upscale_factor);
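
Because both new parameters default to `nullptr`, existing callers of `new_sd_ctx` and `new_upscaler_ctx` keep compiling unchanged. A hypothetical call site against the updated upscaler declaration (the model path, thread count, endpoint, and split values below are placeholders, not from the commit):

// upscaler_usage.cpp - hedged usage sketch against the updated header
#include "stable-diffusion.h"

int main() {
    const float splits[2] = {0.5f, 0.5f};  // hypothetical: spread across two devices
    upscaler_ctx_t* upscaler = new_upscaler_ctx("esrgan_x4.gguf",      // placeholder model path
                                                8,                     // n_threads
                                                "192.168.1.10:50052",  // rpc_servers, comma-separated
                                                splits);               // tensor_splits
    if (upscaler != nullptr) {
        free_upscaler_ctx(upscaler);
    }
    return 0;
}
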
