Skip to content

Commit b5fb4b7

Browse files
committed
introduce ggml_backend_sched API first attempt
1 parent 10c6501 commit b5fb4b7

File tree

1 file changed: 104 additions and 49 deletions.

ggml_extend.hpp

Lines changed: 104 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -1064,12 +1064,13 @@ struct GGMLRunner {
10641064
struct ggml_context* params_ctx = NULL;
10651065
ggml_backend_buffer_t params_buffer = NULL;
10661066

1067-
struct ggml_context* compute_ctx = NULL;
1068-
struct ggml_gallocr* compute_allocr = NULL;
1067+
struct ggml_context* compute_ctx = NULL;
1068+
ggml_backend_sched_t compute_sched = NULL;
10691069

10701070
std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;
10711071

1072-
ggml_backend_t backend = NULL;
1072+
ggml_backend_t backend = NULL;
1073+
ggml_backend_t cpu_backend = NULL;
10731074

10741075
void alloc_params_ctx() {
10751076
struct ggml_init_params params;
@@ -1090,7 +1091,7 @@ struct GGMLRunner {
10901091

10911092
void alloc_compute_ctx() {
10921093
struct ggml_init_params params;
1093-
params.mem_size = static_cast<size_t>(ggml_tensor_overhead() * MAX_GRAPH_SIZE + ggml_graph_overhead());
1094+
params.mem_size = static_cast<size_t>(ggml_tensor_overhead() * MAX_GRAPH_SIZE * 2 + ggml_graph_overhead_custom(MAX_GRAPH_SIZE, false));
10941095
params.mem_buffer = NULL;
10951096
params.no_alloc = true;
10961097

@@ -1106,54 +1107,83 @@ struct GGMLRunner {
11061107
}
11071108

11081109
bool alloc_compute_buffer(get_graph_cb_t get_graph) {
1109-
if (compute_allocr != NULL) {
1110+
if (compute_sched != NULL) {
11101111
return true;
11111112
}
11121113
reset_compute_ctx();
11131114
struct ggml_cgraph* gf = get_graph();
11141115
backend_tensor_data_map.clear();
1115-
compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
11161116

1117-
if (!ggml_gallocr_reserve(compute_allocr, gf)) {
1118-
// failed to allocate the compute buffer
1119-
LOG_ERROR("%s: failed to allocate the compute buffer\n", get_desc().c_str());
1120-
free_compute_buffer();
1117+
ggml_backend_t backends_list[2];
1118+
int n_backends_for_sched = 0;
1119+
1120+
backends_list[n_backends_for_sched++] = this->backend;
1121+
if (this->cpu_backend) {
1122+
backends_list[n_backends_for_sched++] = this->cpu_backend;
1123+
}
1124+
1125+
compute_sched = ggml_backend_sched_new(backends_list, NULL, n_backends_for_sched, MAX_GRAPH_SIZE, false);
1126+
if (!compute_sched) {
1127+
LOG_ERROR("%s: failed to create backend scheduler\n", get_desc().c_str());
11211128
return false;
11221129
}
11231130

1124-
// compute the required memory
1125-
size_t compute_buffer_size = ggml_gallocr_get_buffer_size(compute_allocr, 0);
1126-
LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
1127-
get_desc().c_str(),
1128-
compute_buffer_size / 1024.0 / 1024.0,
1129-
ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
1131+
if (!ggml_backend_sched_reserve(compute_sched, gf)) {
1132+
LOG_ERROR("%s: failed to reserve memory with backend scheduler for graph\n", get_desc().c_str());
1133+
ggml_backend_sched_free(compute_sched);
1134+
compute_sched = NULL;
1135+
return false;
1136+
}
1137+
1138+
for (int i = 0; i < n_backends_for_sched; ++i) {
1139+
size_t buffer_size = ggml_backend_sched_get_buffer_size(compute_sched, backends_list[i]);
1140+
LOG_DEBUG("%s compute buffer size for %s: %.2f MB",
1141+
get_desc().c_str(),
1142+
ggml_backend_name(backends_list[i]),
1143+
buffer_size / 1024.0 / 1024.0);
1144+
}
11301145
return true;
11311146
}
11321147

11331148
void cpy_data_to_backend_tensor() {
11341149
for (auto& kv : backend_tensor_data_map) {
11351150
auto tensor = kv.first;
1136-
auto data = kv.second;
1151+
auto data_src = kv.second;
11371152

1138-
ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
1153+
if (tensor->data == NULL && tensor->buffer == NULL) {
1154+
continue;
1155+
}
1156+
ggml_backend_tensor_set(tensor, data_src, 0, ggml_nbytes(tensor));
11391157
}
1140-
11411158
backend_tensor_data_map.clear();
11421159
}
11431160

11441161
public:
11451162
virtual std::string get_desc() = 0;
11461163

1147-
GGMLRunner(ggml_backend_t backend)
1148-
: backend(backend) {
1164+
GGMLRunner(ggml_backend_t backend_in)
1165+
: backend(backend_in) {
11491166
alloc_params_ctx();
1167+
if (!ggml_backend_is_cpu(this->backend)) {
1168+
this->cpu_backend = ggml_backend_cpu_init();
1169+
if (!this->cpu_backend) {
1170+
// Avoid calling pure virtual get_desc() here.
1171+
LOG_ERROR("FATAL: Failed to initialize CPU backend for fallback.");
1172+
}
1173+
} else {
1174+
this->cpu_backend = NULL;
1175+
}
11501176
}
11511177

11521178
virtual ~GGMLRunner() {
11531179
free_params_buffer();
11541180
free_compute_buffer();
11551181
free_params_ctx();
11561182
free_compute_ctx();
1183+
if (cpu_backend) {
1184+
ggml_backend_free(cpu_backend);
1185+
cpu_backend = NULL;
1186+
}
11571187
}
11581188

11591189
void reset_compute_ctx() {
@@ -1165,22 +1195,17 @@ struct GGMLRunner {
11651195
size_t num_tensors = ggml_tensor_num(params_ctx);
11661196
params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, backend);
11671197
if (params_buffer == NULL) {
1168-
LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
1198+
LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %zu",
11691199
get_desc().c_str(),
11701200
num_tensors);
11711201
return false;
11721202
}
11731203
size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
1174-
LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
1204+
LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%zu tensors)",
11751205
get_desc().c_str(),
11761206
params_buffer_size / (1024.0 * 1024.0),
11771207
ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
11781208
num_tensors);
1179-
// printf("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)\n",
1180-
// get_desc().c_str(),
1181-
// params_buffer_size / (1024.0 * 1024.0),
1182-
// ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
1183-
// num_tensors);
11841209
return true;
11851210
}
11861211

@@ -1199,13 +1224,12 @@ struct GGMLRunner {
11991224
}
12001225

12011226
void free_compute_buffer() {
1202-
if (compute_allocr != NULL) {
1203-
ggml_gallocr_free(compute_allocr);
1204-
compute_allocr = NULL;
1227+
if (compute_sched != NULL) {
1228+
ggml_backend_sched_free(compute_sched);
1229+
compute_sched = NULL;
12051230
}
12061231
}
12071232

1208-
// do copy after alloc graph
12091233
void set_backend_tensor_data(struct ggml_tensor* tensor, const void* data) {
12101234
backend_tensor_data_map[tensor] = data;
12111235
}
@@ -1215,11 +1239,12 @@ struct GGMLRunner {
12151239
if (tensor == NULL) {
12161240
return NULL;
12171241
}
1218-
// it's performing a compute, check if backend isn't cpu
1219-
if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
1220-
// pass input tensors to gpu memory
1221-
auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
1242+
bool tensor_on_host_or_unmanaged = tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer);
1243+
bool is_param_tensor = false;
12221244

1245+
if (tensor_on_host_or_unmanaged && !is_param_tensor) {
1246+
auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
1247+
ggml_set_name(backend_tensor, tensor->name);
12231248
set_backend_tensor_data(backend_tensor, tensor->data);
12241249
return backend_tensor;
12251250
} else {
@@ -1232,26 +1257,56 @@ struct GGMLRunner {
12321257
bool free_compute_buffer_immediately = true,
12331258
struct ggml_tensor** output = NULL,
12341259
struct ggml_context* output_ctx = NULL) {
1235-
alloc_compute_buffer(get_graph);
1236-
reset_compute_ctx();
1237-
struct ggml_cgraph* gf = get_graph();
1238-
GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
1260+
1261+
if (!alloc_compute_buffer(get_graph)) {
1262+
LOG_ERROR("%s: Failed to allocate/reserve compute buffer with scheduler.", get_desc().c_str());
1263+
return;
1264+
}
1265+
1266+
reset_compute_ctx();
1267+
struct ggml_cgraph* gf = get_graph();
1268+
1269+
GGML_ASSERT(compute_sched != NULL);
1270+
ggml_backend_sched_reset(compute_sched);
1271+
1272+
if (!ggml_backend_sched_alloc_graph(compute_sched, gf)) {
1273+
LOG_ERROR("%s: ggml_backend_sched_alloc_graph failed\n", get_desc().c_str());
1274+
return;
1275+
}
1276+
12391277
cpy_data_to_backend_tensor();
1240-
if (ggml_backend_is_cpu(backend)) {
1241-
ggml_backend_cpu_set_n_threads(backend, n_threads);
1278+
1279+
if (ggml_backend_is_cpu(this->backend)) {
1280+
ggml_backend_cpu_set_n_threads(this->backend, n_threads);
1281+
} else if (this->cpu_backend) {
1282+
ggml_backend_cpu_set_n_threads(this->cpu_backend, n_threads);
1283+
}
1284+
1285+
enum ggml_status status = ggml_backend_sched_graph_compute(compute_sched, gf);
1286+
if (status != GGML_STATUS_SUCCESS) {
1287+
LOG_ERROR("%s: ggml_backend_sched_graph_compute failed with status %d (%s)\n",
1288+
get_desc().c_str(), status, ggml_status_to_string(status));
1289+
return;
12421290
}
12431291

1244-
ggml_backend_graph_compute(backend, gf);
12451292
#ifdef GGML_PERF
1246-
ggml_graph_print(gf);
1293+
// ggml_graph_print(gf);
12471294
#endif
1248-
if (output != NULL) {
1249-
auto result = ggml_graph_node(gf, -1);
1295+
if (output != NULL && ggml_graph_n_nodes(gf) > 0) {
1296+
struct ggml_tensor* result_tensor_in_graph = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
1297+
12501298
if (*output == NULL && output_ctx != NULL) {
1251-
*output = ggml_dup_tensor(output_ctx, result);
1299+
*output = ggml_dup_tensor(output_ctx, result_tensor_in_graph);
12521300
}
12531301
if (*output != NULL) {
1254-
ggml_backend_tensor_get_and_sync(backend, result, (*output)->data, 0, ggml_nbytes(*output));
1302+
ggml_backend_t result_backend = ggml_backend_sched_get_tensor_backend(compute_sched, result_tensor_in_graph);
1303+
if (result_backend == NULL) {
1304+
LOG_ERROR("%s: Could not determine backend for result tensor %s\n", get_desc().c_str(), result_tensor_in_graph->name);
1305+
} else {
1306+
ggml_backend_tensor_get_and_sync(result_backend,
1307+
result_tensor_in_graph,
1308+
(*output)->data, 0, ggml_nbytes(*output));
1309+
}
12551310
}
12561311
}
12571312

@@ -1630,4 +1685,4 @@ class MultiheadAttention : public GGMLBlock {
16301685
}
16311686
};
16321687

1633-
#endif // __GGML_EXTEND__HPP__
1688+
#endif // __GGML_EXTEND__HPP__

0 commit comments

Comments
 (0)