
Commit dd3d6b0

refactor ggmlrunner
1 parent b1716c0 commit dd3d6b0

File tree

common.hpp
esrgan.hpp
ggml (submodule)
ggml_extend.hpp
tae.hpp

5 files changed: +110 −54 lines

common.hpp

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ class UpSampleBlock : public GGMLBlock {
         // x: [N, channels, h, w]
         auto conv = std::dynamic_pointer_cast<Conv2d>(blocks["conv"]);
 
-        x = ggml_upscale(ctx, x, 2);  // [N, channels, h*2, w*2]
+        x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST);  // [N, channels, h*2, w*2]
         x = conv->forward(ctx, x);  // [N, out_channels, h*2, w*2]
         return x;
     }
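
Note: the ggml submodule bump changes ggml_upscale to take an explicit scale-mode argument; passing GGML_SCALE_MODE_NEAREST keeps the previous nearest-neighbor behavior. The same mechanical substitution appears in esrgan.hpp and tae.hpp below. A minimal sketch of the new call shape, assuming the headers from the updated submodule (which also define GGML_SCALE_MODE_BILINEAR):

    // 2x nearest-neighbor upsample: [N, C, H, W] -> [N, C, H*2, W*2],
    // equivalent to the old ggml_upscale(ctx, x, 2).
    x = ggml_upscale(ctx, x, 2, GGML_SCALE_MODE_NEAREST);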

esrgan.hpp

Lines changed: 2 additions & 2 deletions
@@ -130,8 +130,8 @@ class RRDBNet : public GGMLBlock {
         body_feat = conv_body->forward(ctx, body_feat);
         feat = ggml_add(ctx, feat, body_feat);
         // upsample
-        feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2)));
-        feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2)));
+        feat = lrelu(ctx, conv_up1->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
+        feat = lrelu(ctx, conv_up2->forward(ctx, ggml_upscale(ctx, feat, 2, GGML_SCALE_MODE_NEAREST)));
         auto out = conv_last->forward(ctx, lrelu(ctx, conv_hr->forward(ctx, feat)));
         return out;
     }

ggml

Submodule ggml updated from ff90529 to 0e07f5c

ggml_extend.hpp

Lines changed: 105 additions & 49 deletions
@@ -117,7 +117,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct g
                             a->ne[0] * b->ne[0],
                             a->ne[1] * b->ne[1],
                             a->ne[2] * b->ne[2],
-                            a->ne[3] * b->ne[3]),
+                            a->ne[3] * b->ne[3],
+                            GGML_SCALE_MODE_NEAREST),
                         b);
 }

@@ -1068,12 +1069,13 @@ struct GGMLRunner {
     struct ggml_context* params_ctx = NULL;
     ggml_backend_buffer_t params_buffer = NULL;
 
-    struct ggml_context* compute_ctx = NULL;
-    struct ggml_gallocr* compute_allocr = NULL;
+    struct ggml_context* compute_ctx = NULL;
+    ggml_backend_sched_t compute_sched = NULL;
 
     std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;
 
-    ggml_backend_t backend = NULL;
+    ggml_backend_t backend = NULL;
+    ggml_backend_t cpu_backend = NULL;
 
     void alloc_params_ctx() {
         struct ggml_init_params params;
@@ -1094,7 +1096,7 @@ struct GGMLRunner {
 
     void alloc_compute_ctx() {
         struct ggml_init_params params;
-        params.mem_size = static_cast<size_t>(ggml_tensor_overhead() * MAX_GRAPH_SIZE + ggml_graph_overhead());
+        params.mem_size = static_cast<size_t>(ggml_tensor_overhead() * MAX_GRAPH_SIZE * 2 + ggml_graph_overhead_custom(MAX_GRAPH_SIZE, false));
         params.mem_buffer = NULL;
         params.no_alloc = true;
 
@@ -1110,54 +1112,83 @@ struct GGMLRunner {
     }
 
     bool alloc_compute_buffer(get_graph_cb_t get_graph) {
-        if (compute_allocr != NULL) {
+        if (compute_sched != NULL) {
             return true;
         }
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_graph();
         backend_tensor_data_map.clear();
-        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
 
-        if (!ggml_gallocr_reserve(compute_allocr, gf)) {
-            // failed to allocate the compute buffer
-            LOG_ERROR("%s: failed to allocate the compute buffer\n", get_desc().c_str());
-            free_compute_buffer();
+        ggml_backend_t backends_list[2];
+        int n_backends_for_sched = 0;
+
+        backends_list[n_backends_for_sched++] = this->backend;
+        if (this->cpu_backend) {
+            backends_list[n_backends_for_sched++] = this->cpu_backend;
+        }
+
+        compute_sched = ggml_backend_sched_new(backends_list, NULL, n_backends_for_sched, MAX_GRAPH_SIZE, false, false);
+        if (!compute_sched) {
+            LOG_ERROR("%s: failed to create backend scheduler\n", get_desc().c_str());
             return false;
         }
 
-        // compute the required memory
-        size_t compute_buffer_size = ggml_gallocr_get_buffer_size(compute_allocr, 0);
-        LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
-                  get_desc().c_str(),
-                  compute_buffer_size / 1024.0 / 1024.0,
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
+        if (!ggml_backend_sched_reserve(compute_sched, gf)) {
+            LOG_ERROR("%s: failed to reserve memory with backend scheduler for graph\n", get_desc().c_str());
+            ggml_backend_sched_free(compute_sched);
+            compute_sched = NULL;
+            return false;
+        }
+
+        for (int i = 0; i < n_backends_for_sched; ++i) {
+            size_t buffer_size = ggml_backend_sched_get_buffer_size(compute_sched, backends_list[i]);
+            LOG_DEBUG("%s compute buffer size for %s: %.2f MB",
+                      get_desc().c_str(),
+                      ggml_backend_name(backends_list[i]),
+                      buffer_size / 1024.0 / 1024.0);
+        }
         return true;
     }
 
     void cpy_data_to_backend_tensor() {
         for (auto& kv : backend_tensor_data_map) {
             auto tensor = kv.first;
-            auto data = kv.second;
+            auto data_src = kv.second;
 
-            ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
+            if (tensor->data == NULL && tensor->buffer == NULL) {
+                continue;
+            }
+            ggml_backend_tensor_set(tensor, data_src, 0, ggml_nbytes(tensor));
         }
-
         backend_tensor_data_map.clear();
     }
 
 public:
     virtual std::string get_desc() = 0;
 
-    GGMLRunner(ggml_backend_t backend)
-        : backend(backend) {
+    GGMLRunner(ggml_backend_t backend_in)
+        : backend(backend_in) {
         alloc_params_ctx();
+        if (!ggml_backend_is_cpu(this->backend)) {
+            this->cpu_backend = ggml_backend_cpu_init();
+            if (!this->cpu_backend) {
+                // Avoid calling pure virtual get_desc() here.
+                LOG_ERROR("FATAL: Failed to initialize CPU backend for fallback.");
+            }
+        } else {
+            this->cpu_backend = NULL;
+        }
     }
 
     virtual ~GGMLRunner() {
         free_params_buffer();
         free_compute_buffer();
         free_params_ctx();
         free_compute_ctx();
+        if (cpu_backend) {
+            ggml_backend_free(cpu_backend);
+            cpu_backend = NULL;
+        }
     }
 
     void reset_compute_ctx() {
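
Note: this hunk replaces the single-backend ggml_gallocr allocator with ggml_backend_sched, which can split a graph between the device backend and a CPU fallback for ops the device does not support. A condensed sketch of the scheduler lifecycle the runner now follows; `backend`, `cpu_backend`, and `build_graph()` stand in for the runner's members and graph callback:

    // Setup: scheduler over device + CPU fallback.
    ggml_backend_t backends[2] = {backend, cpu_backend};
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, MAX_GRAPH_SIZE, false, false);
    ggml_backend_sched_reserve(sched, build_graph());  // size compute buffers from a measure graph

    // Per compute() call: fresh graph, reset, allocate, run.
    struct ggml_cgraph* gf = build_graph();
    ggml_backend_sched_reset(sched);
    ggml_backend_sched_alloc_graph(sched, gf);
    enum ggml_status st = ggml_backend_sched_graph_compute(sched, gf);
    GGML_ASSERT(st == GGML_STATUS_SUCCESS);

    ggml_backend_sched_free(sched);  // teardown, see free_compute_buffer()
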
@@ -1169,22 +1200,17 @@ struct GGMLRunner {
         size_t num_tensors = ggml_tensor_num(params_ctx);
         params_buffer = ggml_backend_alloc_ctx_tensors(params_ctx, backend);
         if (params_buffer == NULL) {
-            LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
+            LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %zu",
                       get_desc().c_str(),
                       num_tensors);
             return false;
         }
         size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
-        LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
+        LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%zu tensors)",
                   get_desc().c_str(),
                   params_buffer_size / (1024.0 * 1024.0),
                   ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
                   num_tensors);
-        // printf("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)\n",
-        //        get_desc().c_str(),
-        //        params_buffer_size / (1024.0 * 1024.0),
-        //        ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
-        //        num_tensors);
         return true;
     }

@@ -1203,13 +1229,12 @@ struct GGMLRunner {
     }
 
     void free_compute_buffer() {
-        if (compute_allocr != NULL) {
-            ggml_gallocr_free(compute_allocr);
-            compute_allocr = NULL;
+        if (compute_sched != NULL) {
+            ggml_backend_sched_free(compute_sched);
+            compute_sched = NULL;
         }
     }
 
-    // do copy after alloc graph
     void set_backend_tensor_data(struct ggml_tensor* tensor, const void* data) {
         backend_tensor_data_map[tensor] = data;
     }
@@ -1219,11 +1244,12 @@ struct GGMLRunner {
         if (tensor == NULL) {
             return NULL;
         }
-        // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
-            // pass input tensors to gpu memory
-            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
+        bool tensor_on_host_or_unmanaged = tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer);
+        bool is_param_tensor = false;
 
+        if (tensor_on_host_or_unmanaged && !is_param_tensor) {
+            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
+            ggml_set_name(backend_tensor, tensor->name);
             set_backend_tensor_data(backend_tensor, tensor->data);
             return backend_tensor;
         } else {
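
Note: this pairs with cpy_data_to_backend_tensor() above. Host-resident inputs are duplicated into compute_ctx as placeholders while the graph is built, and their bytes are uploaded only once the scheduler has assigned real buffers. A minimal sketch of the deferred-upload pattern, with `host` as a placeholder input tensor:

    // 1) While building the graph: dup the host tensor (no storage yet)
    //    and record where its bytes live.
    struct ggml_tensor* dev = ggml_dup_tensor(compute_ctx, host);
    ggml_set_name(dev, host->name);
    set_backend_tensor_data(dev, host->data);

    // 2) After ggml_backend_sched_alloc_graph(): `dev` owns a buffer, so
    //    cpy_data_to_backend_tensor() can copy the bytes in.
    ggml_backend_tensor_set(dev, host->data, 0, ggml_nbytes(dev));
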
@@ -1236,26 +1262,56 @@ struct GGMLRunner {
                  bool free_compute_buffer_immediately = true,
                  struct ggml_tensor** output = NULL,
                  struct ggml_context* output_ctx = NULL) {
-        alloc_compute_buffer(get_graph);
-        reset_compute_ctx();
-        struct ggml_cgraph* gf = get_graph();
-        GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
+
+        if (!alloc_compute_buffer(get_graph)) {
+            LOG_ERROR("%s: Failed to allocate/reserve compute buffer with scheduler.", get_desc().c_str());
+            return;
+        }
+
+        reset_compute_ctx();
+        struct ggml_cgraph* gf = get_graph();
+
+        GGML_ASSERT(compute_sched != NULL);
+        ggml_backend_sched_reset(compute_sched);
+
+        if (!ggml_backend_sched_alloc_graph(compute_sched, gf)) {
+            LOG_ERROR("%s: ggml_backend_sched_alloc_graph failed\n", get_desc().c_str());
+            return;
+        }
+
         cpy_data_to_backend_tensor();
-        if (ggml_backend_is_cpu(backend)) {
-            ggml_backend_cpu_set_n_threads(backend, n_threads);
+
+        if (ggml_backend_is_cpu(this->backend)) {
+            ggml_backend_cpu_set_n_threads(this->backend, n_threads);
+        } else if (this->cpu_backend) {
+            ggml_backend_cpu_set_n_threads(this->cpu_backend, n_threads);
+        }
+
+        enum ggml_status status = ggml_backend_sched_graph_compute(compute_sched, gf);
+        if (status != GGML_STATUS_SUCCESS) {
+            LOG_ERROR("%s: ggml_backend_sched_graph_compute failed with status %d (%s)\n",
+                      get_desc().c_str(), status, ggml_status_to_string(status));
+            return;
         }
 
-        ggml_backend_graph_compute(backend, gf);
 #ifdef GGML_PERF
-        ggml_graph_print(gf);
+        // ggml_graph_print(gf);
 #endif
-        if (output != NULL) {
-            auto result = ggml_graph_node(gf, -1);
+        if (output != NULL && ggml_graph_n_nodes(gf) > 0) {
+            struct ggml_tensor* result_tensor_in_graph = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
+
             if (*output == NULL && output_ctx != NULL) {
-                *output = ggml_dup_tensor(output_ctx, result);
+                *output = ggml_dup_tensor(output_ctx, result_tensor_in_graph);
             }
             if (*output != NULL) {
-                ggml_backend_tensor_get_and_sync(backend, result, (*output)->data, 0, ggml_nbytes(*output));
+                ggml_backend_t result_backend = ggml_backend_sched_get_tensor_backend(compute_sched, result_tensor_in_graph);
+                if (result_backend == NULL) {
+                    LOG_ERROR("%s: Could not determine backend for result tensor %s\n", get_desc().c_str(), result_tensor_in_graph->name);
+                } else {
+                    ggml_backend_tensor_get_and_sync(result_backend,
+                                                     result_tensor_in_graph,
+                                                     (*output)->data, 0, ggml_nbytes(*output));
+                }
             }
         }
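
Note: since the scheduler may place the final node on either backend, the read-back now asks the scheduler which backend owns the result instead of assuming `backend`. A minimal sketch of that step; `dst` is a placeholder host buffer of at least ggml_nbytes(result) bytes, and the project's ggml_backend_tensor_get_and_sync helper wraps the synchronize-and-get:

    struct ggml_tensor* result = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
    ggml_backend_t owner = ggml_backend_sched_get_tensor_backend(compute_sched, result);
    if (owner != NULL) {
        ggml_backend_synchronize(owner);  // finish any pending work on that backend
        ggml_backend_tensor_get(result, dst, 0, ggml_nbytes(result));
    }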

tae.hpp

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ class TinyDecoder : public UnaryBlock {
         if (i == 1) {
             h = ggml_relu_inplace(ctx, h);
         } else {
-            h = ggml_upscale(ctx, h, 2);
+            h = ggml_upscale(ctx, h, 2, GGML_SCALE_MODE_NEAREST);
         }
         continue;
     }
