@@ -50,8 +50,7 @@ const char* sampling_methods_str[] = {
     "iPNDM",
     "iPNDM_v",
     "LCM",
-    "DDIM \"trailing\""
-};
+    "DDIM \"trailing\""};
 
 /* ================================================== Helper Functions ================================================*/
 
@@ -143,8 +142,8 @@ class StableDiffusionGGML {
 public:
     ggml_backend_t backend             = NULL;  // general backend
     ggml_backend_t clip_backend        = NULL;
-    ggml_backend_t control_net_backend = NULL;
     ggml_backend_t vae_backend         = NULL;
+    ggml_backend_t control_net_backend = NULL;
     ggml_type model_wtype  = GGML_TYPE_COUNT;
     ggml_type clip_l_wtype = GGML_TYPE_COUNT;
     ggml_type clip_g_wtype = GGML_TYPE_COUNT;
@@ -232,55 +231,157 @@ class StableDiffusionGGML {
                        bool vae_on_cpu,
                        bool diffusion_flash_attn,
                        bool tae_preview_only,
-                       int main_gpu) {
+                       const std::vector<std::string>& rpc_servers,
+                       const float* tensor_split) {
         use_tiny_autoencoder = taesd_path.size() > 0;
 
         ggml_log_set(ggml_log_callback_default, nullptr);
-#ifdef SD_USE_CUDA
-#ifdef SD_USE_HIP
-        LOG_DEBUG("Using HIP backend");
-#elif defined(SD_USE_MUSA)
-        LOG_DEBUG("Using MUSA backend");
-#else
-        LOG_DEBUG("Using CUDA backend");
-#endif
-        backend = ggml_backend_cuda_init(main_gpu);
-        if (!backend) {
-            LOG_ERROR("CUDA backend init failed");
-        }
-#endif
-#ifdef SD_USE_METAL
-        LOG_DEBUG("Using Metal backend");
-        backend = ggml_backend_metal_init();
-        if (!backend) {
-            LOG_ERROR("Metal backend init failed");
-        }
-#endif
-#ifdef SD_USE_VULKAN
-        LOG_DEBUG("Using Vulkan backend");
-        backend = ggml_backend_vk_init(main_gpu);
-        if (!backend) {
-            LOG_ERROR("Vulkan backend init failed");
-        }
-#endif
-#ifdef SD_USE_SYCL
-        LOG_DEBUG("Using SYCL backend");
-        backend = ggml_backend_sycl_init(main_gpu);
-        if (!backend) {
-            LOG_ERROR("SYCL backend init failed");
-        }
-#endif
-#ifdef SD_USE_CANN
-        LOG_DEBUG("Using CANN backend");
-        backend = ggml_backend_cann_init(main_gpu);
-        if (!backend) {
-            LOG_ERROR("CANN backend init failed");
-        }
-#endif
-
-        if (!backend) {
-            LOG_DEBUG("Using CPU backend");
+
+        std::vector<ggml_backend_dev_t> devices;
+
+        if (!rpc_servers.empty()) {
+            ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+            if (!rpc_reg) {
+                LOG_ERROR("failed to find RPC backend");
+                return false;
+            }
+
+            typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char* endpoint);
+            ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t)ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+            if (!ggml_backend_rpc_add_device_fn) {
+                LOG_ERROR("failed to find RPC device add function");
+                return false;
+            }
+
+            for (const std::string& server : rpc_servers) {
+                ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+                if (dev) {
+                    devices.push_back(dev);
+                } else {
+                    LOG_ERROR("failed to add RPC device for server '%s'", server.c_str());
+                    return false;
+                }
+            }
+        }
+
+        // use all available devices
+        for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+            switch (ggml_backend_dev_type(dev)) {
+                case GGML_BACKEND_DEVICE_TYPE_CPU:
+                case GGML_BACKEND_DEVICE_TYPE_ACCEL:
+                    // skip CPU backends since they are handled separately
+                    break;
+
+                case GGML_BACKEND_DEVICE_TYPE_GPU:
+                    devices.push_back(dev);
+                    break;
+            }
+        }
+
+        for (auto* dev : devices) {
+            size_t free, total;  // NOLINT
+            ggml_backend_dev_memory(dev, &free, &total);
+            LOG_INFO("using device %s (%s) - %zu MiB free", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), free / 1024 / 1024);
+        }
+
+        // build GPU devices buffer list
+        std::vector<std::pair<ggml_backend_dev_t, ggml_backend_buffer_type_t>> gpu_devices;
+        {
+            const bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + devices.size(), [](float x) { return x == 0.0f; });
+            // add GPU buffer types
+            for (size_t i = 0; i < devices.size(); ++i) {
+                if (!all_zero && tensor_split[i] <= 0.0f) {
+                    continue;
+                }
+                ggml_backend_device* dev = devices[i];
+                gpu_devices.emplace_back(dev, ggml_backend_dev_buffer_type(dev));
+            }
+        }
+
+        // initialize the backend
+        if (gpu_devices.empty()) {
+            // no GPU devices available
             backend = ggml_backend_cpu_init();
+        } else if (gpu_devices.size() < 3) {
+            // use the last GPU device
+            backend = ggml_backend_dev_init(gpu_devices[gpu_devices.size() - 1].first, nullptr);
+        } else {
+            // use the 3rd GPU device
+            backend = ggml_backend_dev_init(gpu_devices[2].first, nullptr);
+        }
+        switch (gpu_devices.size()) {
+            case 0: {
+                clip_backend        = backend;
+                vae_backend         = backend;
+                control_net_backend = backend;
+                break;
+            }
+            case 1: {
+                // device 0: clip, vae, control_net
+                clip_backend = backend;
+                if (clip_on_cpu) {
+                    LOG_INFO("CLIP: Using CPU backend");
+                    clip_backend = ggml_backend_cpu_init();
+                }
+                vae_backend = backend;
+                if (vae_on_cpu) {
+                    LOG_INFO("VAE Autoencoder: Using CPU backend");
+                    vae_backend = ggml_backend_cpu_init();
+                }
+                control_net_backend = backend;
+                if (control_net_cpu) {
+                    LOG_INFO("ControlNet: Using CPU backend");
+                    control_net_backend = ggml_backend_cpu_init();
+                }
+                break;
+            }
+            case 2: {
+                // device 0: clip, vae
+                // device 1: control_net
+                if (clip_on_cpu) {
+                    LOG_INFO("CLIP: Using CPU backend");
+                    clip_backend = ggml_backend_cpu_init();
+                } else {
+                    clip_backend = ggml_backend_dev_init(gpu_devices[0].first, nullptr);
+                }
+                if (vae_on_cpu) {
+                    LOG_INFO("VAE Autoencoder: Using CPU backend");
+                    vae_backend = ggml_backend_cpu_init();
+                } else {
+                    vae_backend = ggml_backend_dev_init(gpu_devices[0].first, nullptr);
+                }
+                if (control_net_cpu) {
+                    LOG_INFO("ControlNet: Using CPU backend");
+                    control_net_backend = ggml_backend_cpu_init();
+                } else {
+                    control_net_backend = ggml_backend_dev_init(gpu_devices[1].first, nullptr);
+                }
+                break;
+            }
+            default: {
+                // device 0: clip
+                // device 1: vae
+                // device 2: control_net
+                if (clip_on_cpu) {
+                    LOG_INFO("CLIP: Using CPU backend");
+                    clip_backend = ggml_backend_cpu_init();
+                } else {
+                    clip_backend = ggml_backend_dev_init(gpu_devices[0].first, nullptr);
+                }
+                if (vae_on_cpu) {
+                    LOG_INFO("VAE Autoencoder: Using CPU backend");
+                    vae_backend = ggml_backend_cpu_init();
+                } else {
+                    vae_backend = ggml_backend_dev_init(gpu_devices[1].first, nullptr);
+                }
+                if (control_net_cpu) {
+                    LOG_INFO("ControlNet: Using CPU backend");
+                    control_net_backend = ggml_backend_cpu_init();
+                } else {
+                    control_net_backend = ggml_backend_dev_init(gpu_devices[2].first, nullptr);
+                }
+            }
         }
 
         ModelLoader model_loader;
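
For illustration, here is a minimal standalone sketch of the device selection this hunk introduces, using only the public ggml backend-registry API; the helper name list_gpu_devices and the demo tensor_split values are assumptions for the example, not part of the patch:

#include <cstdio>
#include <vector>

#include "ggml-backend.h"

// Collect GPU devices the way the patch does: CPU and ACCEL devices are
// skipped because the CPU backend is created separately.
static std::vector<ggml_backend_dev_t> list_gpu_devices() {
    std::vector<ggml_backend_dev_t> devices;
    for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_GPU) {
            devices.push_back(dev);
        }
    }
    return devices;
}

int main() {
    std::vector<ggml_backend_dev_t> devices = list_gpu_devices();
    // In this patch tensor_split is a per-device mask, not a proportional
    // split: when the array is not all zero, device i is kept only if
    // tensor_split[i] > 0. A null or all-zero array keeps every device.
    std::vector<float> tensor_split(devices.size(), 1.0f);  // keep all (demo value)
    for (size_t i = 0; i < devices.size(); ++i) {
        if (tensor_split[i] <= 0.0f) {
            continue;
        }
        printf("would use device %s (%s)\n",
               ggml_backend_dev_name(devices[i]),
               ggml_backend_dev_description(devices[i]));
    }
    return 0;
}

With the resulting gpu_devices list, the constructor runs the diffusion model on the last GPU (the third one when three or more are present) and, as the switch above shows, places CLIP, VAE, and ControlNet on devices 0, 0/1, or 0/1/2 depending on how many GPUs survive the mask.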
@@ -441,24 +542,19 @@ class StableDiffusionGGML {
         auto cc_vae = model_loader.has_prefix_tensors("first_stage_model.") && !model_loader.has_prefix_tensors("vae.");
 
         if (version == VERSION_SVD) {
-            clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(backend, model_loader.tensor_storages_types, cc_clip_l);
+            clip_vision = std::make_shared<FrozenCLIPVisionEmbedder>(clip_backend, model_loader.tensor_storages_types, cc_clip_l);
             clip_vision->alloc_params_buffer();
             clip_vision->get_param_tensors(tensors);
 
             diffusion_model = std::make_shared<UNetModel>(backend, model_loader.tensor_storages_types, version);
             diffusion_model->alloc_params_buffer();
             diffusion_model->get_param_tensors(tensors);
 
-            first_stage_model = std::make_shared<AutoEncoderKL>(backend, model_loader.tensor_storages_types, vae_decode_only, true, version, cc_vae);
+            first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, vae_decode_only, true, version, cc_vae);
             LOG_DEBUG("vae_decode_only %d", vae_decode_only);
             first_stage_model->alloc_params_buffer();
             first_stage_model->get_param_tensors(tensors);
         } else {
-            clip_backend = backend;
-            if (clip_on_cpu && !ggml_backend_is_cpu(backend)) {
-                LOG_INFO("CLIP: Using CPU backend");
-                clip_backend = ggml_backend_cpu_init();
-            }
             if (diffusion_flash_attn) {
                 LOG_INFO("Using flash attention in the diffusion model");
             }
@@ -487,30 +583,17 @@ class StableDiffusionGGML {
             diffusion_model->get_param_tensors(tensors);
 
             if (!use_tiny_autoencoder || tae_preview_only) {
-                if (vae_on_cpu && !ggml_backend_is_cpu(backend)) {
-                    LOG_INFO("VAE Autoencoder: Using CPU backend");
-                    vae_backend = ggml_backend_cpu_init();
-                } else {
-                    vae_backend = backend;
-                }
                 first_stage_model = std::make_shared<AutoEncoderKL>(vae_backend, model_loader.tensor_storages_types, vae_decode_only, false, version, cc_vae);
                 first_stage_model->alloc_params_buffer();
                 first_stage_model->get_param_tensors(tensors);
             }
             if (use_tiny_autoencoder) {
-                tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_loader.tensor_storages_types, vae_decode_only, version, cc_vae);
+                tae_first_stage = std::make_shared<TinyAutoEncoder>(vae_backend, model_loader.tensor_storages_types, vae_decode_only, version, cc_vae);
             }
             // first_stage_model->get_param_tensors(tensors, "first_stage_model.");
 
             if (control_net_path.size() > 0) {
-                ggml_backend_t controlnet_backend = NULL;
-                if (control_net_cpu && !ggml_backend_is_cpu(backend)) {
-                    LOG_DEBUG("ControlNet: Using CPU backend");
-                    controlnet_backend = ggml_backend_cpu_init();
-                } else {
-                    controlnet_backend = backend;
-                }
-                control_net = std::make_shared<ControlNet>(controlnet_backend, model_loader.tensor_storages_types, version);
+                control_net = std::make_shared<ControlNet>(control_net_backend, model_loader.tensor_storages_types, version);
             }
 
             if (id_embeddings_path.find("v2") != std::string::npos) {
@@ -1418,7 +1501,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                      bool keep_vae_on_cpu,
                      bool diffusion_flash_attn,
                      bool tae_preview_only,
-                     int main_gpu) {
+                     const char* rpc_servers,
+                     const float* tensor_splits) {
     sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t));
     if (sd_ctx == NULL) {
         return NULL;
@@ -1434,6 +1518,18 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
     std::string embd_path(embed_dir_c_str);
     std::string id_embd_path(id_embed_dir_c_str);
     std::string lora_model_dir(lora_model_dir_c_str);
+    std::vector<std::string> rpc_servers_vec;
+    if (rpc_servers != nullptr && rpc_servers[0] != '\0') {
+        // split the comma-separated server list into rpc_servers_vec
+        std::string servers(rpc_servers);
+        size_t pos = 0;
+        while ((pos = servers.find(',')) != std::string::npos) {
+            std::string server = servers.substr(0, pos);
+            rpc_servers_vec.push_back(server);
+            servers.erase(0, pos + 1);
+        }
+        rpc_servers_vec.push_back(servers);
+    }
 
     sd_ctx->sd = new StableDiffusionGGML(n_threads,
                                          vae_decode_only,
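
For reference, here is the comma splitting above factored into a standalone helper (the name split_servers and the sample endpoints are illustrative only). Note that entries are not trimmed, so whitespace around commas is preserved, and a trailing comma produces an empty final entry:

#include <cassert>
#include <string>
#include <vector>

// Same splitting logic as the hunk above, as a reusable helper.
static std::vector<std::string> split_servers(std::string servers) {
    std::vector<std::string> out;
    size_t pos = 0;
    while ((pos = servers.find(',')) != std::string::npos) {
        out.push_back(servers.substr(0, pos));
        servers.erase(0, pos + 1);
    }
    out.push_back(servers);  // remaining text is the last (or only) entry
    return out;
}

int main() {
    std::vector<std::string> v = split_servers("192.168.1.10:50052,192.168.1.11:50052");
    assert(v.size() == 2);
    assert(v[1] == "192.168.1.11:50052");
    return 0;
}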
@@ -1463,7 +1559,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str,
                                          keep_vae_on_cpu,
                                          diffusion_flash_attn,
                                          tae_preview_only,
-                                         main_gpu)) {
+                                         rpc_servers_vec,
+                                         tensor_splits)) {
         delete sd_ctx->sd;
         sd_ctx->sd = NULL;
         free(sd_ctx);