@@ -117,7 +117,8 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct g
                              a->ne[0] * b->ne[0],
                              a->ne[1] * b->ne[1],
                              a->ne[2] * b->ne[2],
-                             a->ne[3] * b->ne[3]),
+                             a->ne[3] * b->ne[3],
+                             GGML_SCALE_MODE_NEAREST),
                     b);
 }
 
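For context: this hunk adapts `ggml_kronecker` to a newer ggml API in which the upscale operator takes an explicit `enum ggml_scale_mode`; passing `GGML_SCALE_MODE_NEAREST` keeps the previous nearest-neighbour behaviour. A minimal sketch of the full helper after this change, assuming the truncated enclosing calls are `ggml_mul` and `ggml_upscale_ext` (they are not shown in the diff):

// Sketch only: the enclosing calls are assumptions inferred from the visible fragment.
__STATIC_INLINE__ struct ggml_tensor* ggml_kronecker(ggml_context* ctx, struct ggml_tensor* a, struct ggml_tensor* b) {
    return ggml_mul(ctx,
                    ggml_upscale_ext(ctx,
                                     a,
                                     a->ne[0] * b->ne[0],
                                     a->ne[1] * b->ne[1],
                                     a->ne[2] * b->ne[2],
                                     a->ne[3] * b->ne[3],
                                     GGML_SCALE_MODE_NEAREST),  // scale mode is now an explicit argument
                    b);
}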
@@ -1068,12 +1069,13 @@ struct GGMLRunner {
     struct ggml_context* params_ctx      = NULL;
     ggml_backend_buffer_t params_buffer  = NULL;
 
-    struct ggml_context* compute_ctx    = NULL;
-    struct ggml_gallocr* compute_allocr = NULL;
+    struct ggml_context* compute_ctx   = NULL;
+    ggml_backend_sched_t compute_sched = NULL;
 
     std::map<struct ggml_tensor*, const void*> backend_tensor_data_map;
 
-    ggml_backend_t backend = NULL;
+    ggml_backend_t backend     = NULL;
+    ggml_backend_t cpu_backend = NULL;
 
     void alloc_params_ctx() {
         struct ggml_init_params params;
@@ -1094,7 +1096,7 @@ struct GGMLRunner {
 
     void alloc_compute_ctx() {
         struct ggml_init_params params;
-        params.mem_size   = static_cast<size_t>(ggml_tensor_overhead() * MAX_GRAPH_SIZE + ggml_graph_overhead());
+        params.mem_size   = static_cast<size_t>(ggml_tensor_overhead() * MAX_GRAPH_SIZE * 2 + ggml_graph_overhead_custom(MAX_GRAPH_SIZE, false));
         params.mem_buffer = NULL;
         params.no_alloc   = true;
 
@@ -1110,54 +1112,83 @@ struct GGMLRunner {
     }
 
     bool alloc_compute_buffer(get_graph_cb_t get_graph) {
-        if (compute_allocr != NULL) {
+        if (compute_sched != NULL) {
             return true;
         }
         reset_compute_ctx();
         struct ggml_cgraph* gf = get_graph();
         backend_tensor_data_map.clear();
-        compute_allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
 
-        if (!ggml_gallocr_reserve(compute_allocr, gf)) {
-            // failed to allocate the compute buffer
-            LOG_ERROR("%s: failed to allocate the compute buffer\n", get_desc().c_str());
-            free_compute_buffer();
+        ggml_backend_t backends_list[2];
+        int n_backends_for_sched = 0;
+
+        backends_list[n_backends_for_sched++] = this->backend;
+        if (this->cpu_backend) {
+            backends_list[n_backends_for_sched++] = this->cpu_backend;
+        }
+
+        compute_sched = ggml_backend_sched_new(backends_list, NULL, n_backends_for_sched, MAX_GRAPH_SIZE, false, false);
+        if (!compute_sched) {
+            LOG_ERROR("%s: failed to create backend scheduler\n", get_desc().c_str());
             return false;
         }
 
-        // compute the required memory
-        size_t compute_buffer_size = ggml_gallocr_get_buffer_size(compute_allocr, 0);
-        LOG_DEBUG("%s compute buffer size: %.2f MB(%s)",
-                  get_desc().c_str(),
-                  compute_buffer_size / 1024.0 / 1024.0,
-                  ggml_backend_is_cpu(backend) ? "RAM" : "VRAM");
+        if (!ggml_backend_sched_reserve(compute_sched, gf)) {
+            LOG_ERROR("%s: failed to reserve memory with backend scheduler for graph\n", get_desc().c_str());
+            ggml_backend_sched_free(compute_sched);
+            compute_sched = NULL;
+            return false;
+        }
+
+        for (int i = 0; i < n_backends_for_sched; ++i) {
+            size_t buffer_size = ggml_backend_sched_get_buffer_size(compute_sched, backends_list[i]);
+            LOG_DEBUG("%s compute buffer size for %s: %.2f MB",
+                      get_desc().c_str(),
+                      ggml_backend_name(backends_list[i]),
+                      buffer_size / 1024.0 / 1024.0);
+        }
         return true;
     }
 
     void cpy_data_to_backend_tensor() {
         for (auto& kv : backend_tensor_data_map) {
             auto tensor = kv.first;
-            auto data   = kv.second;
+            auto data_src = kv.second;
 
-            ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor));
+            if (tensor->data == NULL && tensor->buffer == NULL) {
+                continue;
+            }
+            ggml_backend_tensor_set(tensor, data_src, 0, ggml_nbytes(tensor));
         }
-
         backend_tensor_data_map.clear();
     }
 
 public:
     virtual std::string get_desc() = 0;
 
-    GGMLRunner(ggml_backend_t backend)
-        : backend(backend) {
+    GGMLRunner(ggml_backend_t backend_in)
+        : backend(backend_in) {
         alloc_params_ctx();
+        if (!ggml_backend_is_cpu(this->backend)) {
+            this->cpu_backend = ggml_backend_cpu_init();
+            if (!this->cpu_backend) {
+                // Avoid calling pure virtual get_desc() here.
+                LOG_ERROR("FATAL: Failed to initialize CPU backend for fallback.");
+            }
+        } else {
+            this->cpu_backend = NULL;
+        }
     }
 
     virtual ~GGMLRunner() {
         free_params_buffer();
         free_compute_buffer();
         free_params_ctx();
         free_compute_ctx();
+        if (cpu_backend) {
+            ggml_backend_free(cpu_backend);
+            cpu_backend = NULL;
+        }
     }
 
     void reset_compute_ctx() {
@@ -1169,22 +1200,17 @@ struct GGMLRunner {
         size_t num_tensors = ggml_tensor_num(params_ctx);
         params_buffer      = ggml_backend_alloc_ctx_tensors(params_ctx, backend);
         if (params_buffer == NULL) {
-            LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %i",
+            LOG_ERROR("%s alloc params backend buffer failed, num_tensors = %zu",
                       get_desc().c_str(),
                       num_tensors);
             return false;
         }
         size_t params_buffer_size = ggml_backend_buffer_get_size(params_buffer);
-        LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)",
+        LOG_DEBUG("%s params backend buffer size = % 6.2f MB(%s) (%zu tensors)",
                   get_desc().c_str(),
                   params_buffer_size / (1024.0 * 1024.0),
                   ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
                   num_tensors);
-        // printf("%s params backend buffer size = % 6.2f MB(%s) (%i tensors)\n",
-        //        get_desc().c_str(),
-        //        params_buffer_size / (1024.0 * 1024.0),
-        //        ggml_backend_is_cpu(backend) ? "RAM" : "VRAM",
-        //        num_tensors);
         return true;
     }
 
@@ -1203,13 +1229,12 @@ struct GGMLRunner {
     }
 
     void free_compute_buffer() {
-        if (compute_allocr != NULL) {
-            ggml_gallocr_free(compute_allocr);
-            compute_allocr = NULL;
+        if (compute_sched != NULL) {
+            ggml_backend_sched_free(compute_sched);
+            compute_sched = NULL;
         }
     }
 
-    // do copy after alloc graph
     void set_backend_tensor_data(struct ggml_tensor* tensor, const void* data) {
         backend_tensor_data_map[tensor] = data;
     }
@@ -1219,11 +1244,12 @@
         if (tensor == NULL) {
             return NULL;
         }
-        // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
-            // pass input tensors to gpu memory
-            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
+        bool tensor_on_host_or_unmanaged = tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer);
+        bool is_param_tensor             = false;
 
+        if (tensor_on_host_or_unmanaged && !is_param_tensor) {
+            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
+            ggml_set_name(backend_tensor, tensor->name);
             set_backend_tensor_data(backend_tensor, tensor->data);
             return backend_tensor;
         } else {
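The hunk above touches what appears to be the runner's input-upload helper: instead of copying host data immediately, it duplicates the tensor's metadata in `compute_ctx`, records the source pointer, and lets `cpy_data_to_backend_tensor()` perform the real copy once the scheduler has allocated the graph. A minimal sketch of that deferred-copy pattern, with hypothetical free-function names standing in for the class members:

// Deferred input upload: record the source now, copy after graph allocation.
struct ggml_tensor* to_backend_sketch(struct ggml_context* compute_ctx,
                                      std::map<struct ggml_tensor*, const void*>& pending,
                                      struct ggml_tensor* tensor) {
    // Create an unallocated twin that will live in the compute graph...
    struct ggml_tensor* backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
    ggml_set_name(backend_tensor, tensor->name);
    // ...and remember where its bytes should eventually come from.
    pending[backend_tensor] = tensor->data;
    return backend_tensor;
}

// Called after ggml_backend_sched_alloc_graph() has assigned real buffers.
void flush_pending_sketch(std::map<struct ggml_tensor*, const void*>& pending) {
    for (auto& kv : pending) {
        ggml_backend_tensor_set(kv.first, kv.second, 0, ggml_nbytes(kv.first));
    }
    pending.clear();
}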
@@ -1236,26 +1262,56 @@
                  bool free_compute_buffer_immediately = true,
                  struct ggml_tensor** output          = NULL,
                  struct ggml_context* output_ctx      = NULL) {
-        alloc_compute_buffer(get_graph);
-        reset_compute_ctx();
-        struct ggml_cgraph* gf = get_graph();
-        GGML_ASSERT(ggml_gallocr_alloc_graph(compute_allocr, gf));
+
+        if (!alloc_compute_buffer(get_graph)) {
+            LOG_ERROR("%s: Failed to allocate/reserve compute buffer with scheduler.", get_desc().c_str());
+            return;
+        }
+
+        reset_compute_ctx();
+        struct ggml_cgraph* gf = get_graph();
+
+        GGML_ASSERT(compute_sched != NULL);
+        ggml_backend_sched_reset(compute_sched);
+
+        if (!ggml_backend_sched_alloc_graph(compute_sched, gf)) {
+            LOG_ERROR("%s: ggml_backend_sched_alloc_graph failed\n", get_desc().c_str());
+            return;
+        }
+
         cpy_data_to_backend_tensor();
-        if (ggml_backend_is_cpu(backend)) {
-            ggml_backend_cpu_set_n_threads(backend, n_threads);
+
+        if (ggml_backend_is_cpu(this->backend)) {
+            ggml_backend_cpu_set_n_threads(this->backend, n_threads);
+        } else if (this->cpu_backend) {
+            ggml_backend_cpu_set_n_threads(this->cpu_backend, n_threads);
+        }
+
+        enum ggml_status status = ggml_backend_sched_graph_compute(compute_sched, gf);
+        if (status != GGML_STATUS_SUCCESS) {
+            LOG_ERROR("%s: ggml_backend_sched_graph_compute failed with status %d (%s)\n",
+                      get_desc().c_str(), status, ggml_status_to_string(status));
+            return;
         }
 
-        ggml_backend_graph_compute(backend, gf);
 #ifdef GGML_PERF
-        ggml_graph_print(gf);
+        // ggml_graph_print(gf);
 #endif
-        if (output != NULL) {
-            auto result = ggml_graph_node(gf, -1);
+        if (output != NULL && ggml_graph_n_nodes(gf) > 0) {
+            struct ggml_tensor* result_tensor_in_graph = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
+
             if (*output == NULL && output_ctx != NULL) {
-                *output = ggml_dup_tensor(output_ctx, result);
+                *output = ggml_dup_tensor(output_ctx, result_tensor_in_graph);
             }
             if (*output != NULL) {
-                ggml_backend_tensor_get_and_sync(backend, result, (*output)->data, 0, ggml_nbytes(*output));
+                ggml_backend_t result_backend = ggml_backend_sched_get_tensor_backend(compute_sched, result_tensor_in_graph);
+                if (result_backend == NULL) {
+                    LOG_ERROR("%s: Could not determine backend for result tensor %s\n", get_desc().c_str(), result_tensor_in_graph->name);
+                } else {
+                    ggml_backend_tensor_get_and_sync(result_backend,
+                                                     result_tensor_in_graph,
+                                                     (*output)->data, 0, ggml_nbytes(*output));
                }
             }
         }
 
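Taken together, the commit replaces the single-backend ggml_gallocr allocator with ggml_backend_sched, which can split a graph between the primary backend and a CPU fallback. A condensed sketch of the lifecycle the new code follows, using only the calls visible in the diff (error handling trimmed; `gpu_backend`, `build_graph`, and `MAX_GRAPH_SIZE` are placeholders, not names from the file):

// Sketch of the ggml_backend_sched lifecycle adopted above (not a drop-in implementation).
ggml_backend_t backends[2] = {gpu_backend, cpu_backend};   // primary first, CPU fallback second
ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, MAX_GRAPH_SIZE, false, false);

struct ggml_cgraph* gf = build_graph();                    // hypothetical graph builder
ggml_backend_sched_reserve(sched, gf);                     // one-time: size per-backend compute buffers

// Per run:
ggml_backend_sched_reset(sched);                           // clear previous tensor/backend assignments
ggml_backend_sched_alloc_graph(sched, gf);                 // place graph tensors on backends
// ... upload inputs with ggml_backend_tensor_set() here ...
enum ggml_status status = ggml_backend_sched_graph_compute(sched, gf);

// Read the result from whichever backend actually produced it:
struct ggml_tensor* result = ggml_graph_node(gf, ggml_graph_n_nodes(gf) - 1);
ggml_backend_t result_backend = ggml_backend_sched_get_tensor_backend(sched, result);

ggml_backend_sched_free(sched);                            // on shutdown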