@@ -1064,12 +1064,13 @@ struct GGMLRunner {
     struct ggml_context* params_ctx     = NULL;
     ggml_backend_buffer_t params_buffer = NULL;
 
-    struct ggml_context* compute_ctx    = NULL;
-    struct ggml_gallocr* compute_allocr = NULL;
+    struct ggml_context* compute_ctx   = NULL;
+    ggml_backend_sched_t compute_sched = NULL;
 
     std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;
 
-    ggml_backend_t backend = NULL;
+    ggml_backend_t backend     = NULL;
+    ggml_backend_t cpu_backend = NULL;
 
     void alloc_params_ctx() {
         struct ggml_init_params params;
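
Note (not part of the patch): the members above replace ggml's single-backend graph allocator with the backend scheduler, which can split one graph between a primary backend and a CPU fallback. A minimal sketch of the idea, assuming the ggml-backend API used by this patch (gpu_backend is a stand-in; the 5-argument ggml_backend_sched_new matches the call made later in this diff):

    // Primary backend first; ops it cannot run get assigned to the CPU.
    ggml_backend_t backends[2] = {gpu_backend, ggml_backend_cpu_init()};
    ggml_backend_sched_t sched = ggml_backend_sched_new(
        backends, /*bufts*/ NULL, /*n_backends*/ 2,
        /*graph_size*/ MAX_GRAPH_SIZE, /*parallel*/ false);
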
@@ -1090,7 +1091,7 @@ struct GGMLRunner {
 
     void alloc_compute_ctx() {
         struct ggml_init_params params;
-        params.mem_size   = static_cast<size_t>(ggml_tensor_overhead() * MAX_GRAPH_SIZE + ggml_graph_overhead());
+        params.mem_size   = static_cast<size_t>(ggml_tensor_overhead() * MAX_GRAPH_SIZE * 2 + ggml_graph_overhead_custom(MAX_GRAPH_SIZE, false));
         params.mem_buffer = NULL;
         params.no_alloc   = true;
 
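
Note (not part of the patch): this context is metadata-only (no_alloc is true), so mem_size budgets bookkeeping, not tensor data: roughly one ggml_tensor_overhead() per tensor slot, doubled here presumably because the scheduler can insert extra tensor copies when it splits a graph, plus the overhead of one MAX_GRAPH_SIZE-node graph without gradients. The same arithmetic as a sketch:

    // Metadata budget for up to 2*MAX_GRAPH_SIZE tensors plus one graph.
    size_t mem_size = ggml_tensor_overhead() * MAX_GRAPH_SIZE * 2
                    + ggml_graph_overhead_custom(MAX_GRAPH_SIZE, /*grads*/ false);
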
@@ -1106,54 +1107,83 @@ struct GGMLRunner {
     }
 
     bool alloc_compute_buffer(get_graph_cb_t get_graph) {
-        if (compute_allocr != NULL) {
+        if (compute_sched != NULL) {
            return true;
        }
        reset_compute_ctx();
        struct ggml_cgraph* gf = get_graph();
        backend_tensor_data_map.clear();
-        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
 
-        if (!ggml_gallocr_reserve(compute_allocr, gf)) {
-            // failed to allocate the compute buffer
-            LOG_ERROR("%s: failed to allocate the compute buffer\n", get_desc().c_str());
-            free_compute_buffer();
+        ggml_backend_t backends_list[2];
+        int n_backends_for_sched = 0;
+
+        backends_list[n_backends_for_sched++] = this->backend;
+        if (this->cpu_backend) {
+            backends_list[n_backends_for_sched++] = this->cpu_backend;
+        }
+
+        compute_sched = ggml_backend_sched_new(backends_list, NULL, n_backends_for_sched, MAX_GRAPH_SIZE, false);
+        if (!compute_sched) {
+            LOG_ERROR("%s: failed to create backend scheduler\n", get_desc().c_str());
            return false;
        }
 
-        // compute the required memory
-        size_t compute_buffer_size = ggml_gallocr_get_buffer_size(compute_allocr, 0);
-        LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
-                  get_desc().c_str(),
-                  compute_buffer_size / 1024.0 / 1024.0,
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
+        if (!ggml_backend_sched_reserve(compute_sched, gf)) {
+            LOG_ERROR("%s: failed to reserve memory with backend scheduler for graph\n", get_desc().c_str());
+            ggml_backend_sched_free(compute_sched);
+            compute_sched = NULL;
+            return false;
+        }
+
+        for (int i = 0; i < n_backends_for_sched; ++i) {
+            size_t buffer_size = ggml_backend_sched_get_buffer_size(compute_sched, backends_list[i]);
+            LOG_DEBUG("%s compute buffer size for %s: %.2f MB",
+                      get_desc().c_str(),
+                      ggml_backend_name(backends_list[i]),
+                      buffer_size / 1024.0 / 1024.0);
+        }
        return true;
    }
 
    void cpy_data_to_backend_tensor() {
        for (auto& kv : backend_tensor_data_map) {
            auto tensor = kv.first;
-            auto data   = kv.second;
+            auto data_src = kv.second;
 
-            ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
+            if (tensor->data == NULL && tensor->buffer == NULL) {
+                continue;
+            }
+            ggml_backend_tensor_set(tensor, data_src, 0, ggml_nbytes(tensor));
        }
-
        backend_tensor_data_map.clear();
    }
 
public:
    virtual std::string get_desc() = 0;
 
-    GGMLRunner(ggml_backend_t backend)
-        : backend(backend) {
+    GGMLRunner(ggml_backend_t backend_in)
+        : backend(backend_in) {
        alloc_params_ctx();
+        if (!ggml_backend_is_cpu(this->backend)) {
+            this->cpu_backend = ggml_backend_cpu_init();
+            if (!this->cpu_backend) {
+                // Avoid calling pure virtual get_desc() here.
+                LOG_ERROR("FATAL: Failed to initialize CPU backend for fallback.");
+            }
+        } else {
+            this->cpu_backend = NULL;
+        }
    }
 
    virtual ~GGMLRunner() {
        free_params_buffer();
        free_compute_buffer();
        free_params_ctx();
        free_compute_ctx();
+        if (cpu_backend) {
+            ggml_backend_free(cpu_backend);
+            cpu_backend = NULL;
+        }
    }
 
    void reset_compute_ctx() {
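
Note (not part of the patch): condensed, the create/reserve cycle this hunk adopts looks like the sketch below (assuming the ggml-backend scheduler API; error handling elided):

    ggml_backend_t list[2]     = {backend, cpu_backend};  // primary first
    ggml_backend_sched_t sched = ggml_backend_sched_new(list, NULL, 2, MAX_GRAPH_SIZE, false);
    if (sched && ggml_backend_sched_reserve(sched, get_graph())) {
        // Worst-case compute buffers are now reserved, one per backend.
        size_t sz = ggml_backend_sched_get_buffer_size(sched, list[0]);
    }
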
@@ -1165,22 +1195,17 @@ struct GGMLRunner {
        size_t num_tensors = ggml_tensor_num(params_ctx);
        params_buffer      = ggml_backend_alloc_ctx_tensors(params_ctx, backend);
        if (params_buffer == NULL) {
-            LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
+            LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %zu",
                      get_desc().c_str(),
                      num_tensors);
            return false;
        }
        size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
-        LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
+        LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%zu tensors)",
                  get_desc().c_str(),
                  params_buffer_size / (1024.0 * 1024.0),
                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
                  num_tensors);
-        // printf("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)\n",
-        //        get_desc().c_str(),
-        //        params_buffer_size / (1024.0 * 1024.0),
-        //        ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
-        //        num_tensors);
        return true;
    }
 
@@ -1199,13 +1224,12 @@ struct GGMLRunner {
    }
 
    void free_compute_buffer() {
-        if (compute_allocr != NULL) {
-            ggml_gallocr_free(compute_allocr);
-            compute_allocr = NULL;
+        if (compute_sched != NULL) {
+            ggml_backend_sched_free(compute_sched);
+            compute_sched = NULL;
        }
    }
 
-    // do copy after alloc graph
    void set_backend_tensor_data(struct ggml_tensor* tensor, const void* data) {
        backend_tensor_data_map[tensor] = data;
    }
@@ -1215,11 +1239,12 @@ struct GGMLRunner {
        if (tensor == NULL) {
            return NULL;
        }
-        // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
-            // pass input tensors to gpu memory
-            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
+        bool tensor_on_host_or_unmanaged = tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer);
+        bool is_param_tensor             = false;
 
+        if (tensor_on_host_or_unmanaged && !is_param_tensor) {
+            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
+            ggml_set_name(backend_tensor, tensor->name);
            set_backend_tensor_data(backend_tensor, tensor->data);
            return backend_tensor;
        } else {
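
Note (not part of the patch): to_backend() defers the host-to-device copy. At graph-build time it only duplicates tensor metadata into compute_ctx and records the host pointer; cpy_data_to_backend_tensor() performs the actual copy once the scheduler has allocated the tensor. A condensed sketch of that pattern (host_tensor is a stand-in):

    // Build time: metadata duplicate + remember the source pointer.
    struct ggml_tensor* t = ggml_dup_tensor(compute_ctx, host_tensor);
    ggml_set_name(t, host_tensor->name);
    backend_tensor_data_map[t] = host_tensor->data;
    // After ggml_backend_sched_alloc_graph(): copy into the placed buffer.
    ggml_backend_tensor_set(t, backend_tensor_data_map[t], 0, ggml_nbytes(t));
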
@@ -1232,26 +1257,56 @@ struct GGMLRunner {
                 bool free_compute_buffer_immediately = true,
                 struct ggml_tensor** output          = NULL,
                 struct ggml_context* output_ctx      = NULL) {
-        alloc_compute_buffer(get_graph);
-        reset_compute_ctx();
-        struct ggml_cgraph* gf = get_graph();
-        GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
+
+        if (!alloc_compute_buffer(get_graph)) {
+            LOG_ERROR("%s: Failed to allocate/reserve compute buffer with scheduler.", get_desc().c_str());
+            return;
+        }
+
+        reset_compute_ctx();
+        struct ggml_cgraph* gf = get_graph();
+
+        GGML_ASSERT(compute_sched != NULL);
+        ggml_backend_sched_reset(compute_sched);
+
+        if (!ggml_backend_sched_alloc_graph(compute_sched, gf)) {
+            LOG_ERROR("%s: ggml_backend_sched_alloc_graph failed\n", get_desc().c_str());
+            return;
+        }
+
        cpy_data_to_backend_tensor();
-        if (ggml_backend_is_cpu(backend)) {
-            ggml_backend_cpu_set_n_threads(backend, n_threads);
+
+        if (ggml_backend_is_cpu(this->backend)) {
+            ggml_backend_cpu_set_n_threads(this->backend, n_threads);
+        } else if (this->cpu_backend) {
+            ggml_backend_cpu_set_n_threads(this->cpu_backend, n_threads);
+        }
+
+        enum ggml_status status = ggml_backend_sched_graph_compute(compute_sched, gf);
+        if (status != GGML_STATUS_SUCCESS) {
+            LOG_ERROR("%s: ggml_backend_sched_graph_compute failed with status %d (%s)\n",
+                      get_desc().c_str(), status, ggml_status_to_string(status));
+            return;
        }
 
-        ggml_backend_graph_compute(backend, gf);
 #ifdef GGML_PERF
-        ggml_graph_print(gf);
+        // ggml_graph_print(gf);
 #endif
-        if (output != NULL) {
-            auto result = ggml_graph_node(gf, -1);
+        if (output != NULL && ggml_graph_n_nodes(gf) > 0) {
+            struct ggml_tensor* result_tensor_in_graph = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
+
            if (*output == NULL && output_ctx != NULL) {
-                *output = ggml_dup_tensor(output_ctx, result);
+                *output = ggml_dup_tensor(output_ctx, result_tensor_in_graph);
            }
            if (*output != NULL) {
-                ggml_backend_tensor_get_and_sync(backend, result, (*output)->data, 0, ggml_nbytes(*output));
+                ggml_backend_t result_backend = ggml_backend_sched_get_tensor_backend(compute_sched, result_tensor_in_graph);
+                if (result_backend == NULL) {
+                    LOG_ERROR("%s: Could not determine backend for result tensor %s\n", get_desc().c_str(), result_tensor_in_graph->name);
+                } else {
+                    ggml_backend_tensor_get_and_sync(result_backend,
+                                                     result_tensor_in_graph,
+                                                     (*output)->data, 0, ggml_nbytes(*output));
+                }
            }
        }
 
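
Note (not part of the patch): the per-call sequence above, reduced to its scheduler calls (assuming the ggml-backend API; error logging elided):

    ggml_backend_sched_reset(compute_sched);                 // drop prior placement
    if (ggml_backend_sched_alloc_graph(compute_sched, gf)) { // place ops + alloc
        cpy_data_to_backend_tensor();                        // upload inputs
        if (ggml_backend_sched_graph_compute(compute_sched, gf) == GGML_STATUS_SUCCESS) {
            struct ggml_tensor* out = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
            ggml_backend_t owner    = ggml_backend_sched_get_tensor_backend(compute_sched, out);
            // Read the result back from whichever backend produced it.
        }
    }
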
@@ -1630,4 +1685,4 @@ class MultiheadAttention : public GGMLBlock {
    }
 };
 
-#endif  // __GGML_EXTEND__HPP__
+#endif  // __GGML_EXTEND__HPP__