Commit 642d79e

kv-cache : remove LLAMA_SET_ROWS checks

ggml-ci

1 parent: b730706
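Summary: this commit deletes the temporary LLAMA_SET_ROWS environment-variable escape hatch (the deleted comments reference https://github.com/ggml-org/llama.cpp/pull/14285 as its origin). The ggml_set_rows() code path is now the only way new K/V rows are written into the cache: the legacy ggml_cpy() fallback, the per-backend toggle in the CANN context, and the get_supports_set_rows() plumbing through llama_context, llama_kv_cache, and the graph-reuse checks are all removed.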

File tree: 7 files changed, +20 -138 lines changed

ggml/src/ggml-cann/common.h
Lines changed: 0 additions & 9 deletions

@@ -378,7 +378,6 @@ struct ggml_backend_cann_context {
 #endif
     cann_task_queue task_queue;
     bool async_mode;
-    bool support_set_rows;
     void* f32_zero_cache = nullptr;
     void* f32_one_cache = nullptr;
     int64_t f32_zero_cache_element = 0;

@@ -398,14 +397,6 @@ struct ggml_backend_cann_context {
         async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
         GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
                       device, async_mode ? "ON" : "OFF");
-
-        support_set_rows = parse_bool(get_env("LLAMA_SET_ROWS").value_or(""));
-        GGML_LOG_INFO("%s: LLAMA_SET_ROWS is %s\n", __func__, support_set_rows ? "ON" : "OFF");
-
-        if (!support_set_rows) {
-            GGML_LOG_INFO("%s: CANN Graph currently only supports execution when LLAMA_SET_ROWS is ON. "
-                          "Falling back to eager mode.\n", __func__);
-        }
     }
 
     /**
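For context, the deleted CANN toggle was read with the backend's own parse_bool(get_env(...)) helpers, still visible in the surviving GGML_CANN_ASYNC_MODE line above. A stand-alone sketch of that kind of boolean environment check, using only the C++ standard library (the helper name env_flag and the accepted spellings are illustrative assumptions, not the backend's API):

    #include <cstdlib>
    #include <cstring>

    // Hypothetical equivalent of parse_bool(get_env(name).value_or("")):
    // an unset or unrecognized value reads as false.
    static bool env_flag(const char * name) {
        const char * v = std::getenv(name); // nullptr when the variable is unset
        if (v == nullptr) {
            return false;
        }
        return std::strcmp(v, "1") == 0 || std::strcmp(v, "on") == 0 || std::strcmp(v, "yes") == 0;
    }

    // Usage mirroring the removed check:
    //   if (!env_flag("LLAMA_SET_ROWS")) { /* fall back to eager mode */ }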

ggml/src/ggml-cann/ggml-cann.cpp
Lines changed: 0 additions & 5 deletions

@@ -2251,11 +2251,6 @@ static enum ggml_status ggml_backend_cann_graph_compute(
     bool use_cann_graph = true;
     bool cann_graph_update_required = false;
 
-    // check environment LLAMA_SET_ROWS
-    if (!cann_ctx->support_set_rows) {
-        use_cann_graph = false;
-    }
-
     if (use_cann_graph) {
         if (cann_ctx->cann_graph == nullptr) {
             cann_ctx->cann_graph.reset(new ggml_cann_graph());
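Net effect for CANN: use_cann_graph now starts out true and is no longer forced off by the environment, so graph capture is attempted by default instead of falling back to eager mode.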

src/llama-context.cpp
Lines changed: 0 additions & 22 deletions

@@ -102,16 +102,6 @@ llama_context::llama_context(
     cparams.op_offload = params.op_offload;
     cparams.kv_unified = params.kv_unified;
 
-    {
-        const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : supports_set_rows;
-
-        if (!supports_set_rows && !cparams.kv_unified) {
-            LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
-            cparams.kv_unified = true;
-        }
-    }
-
     {
         const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
         graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;

@@ -888,12 +878,6 @@ int llama_context::encode(const llama_batch & batch_inp) {
         }
     }
 
-    if (!supports_set_rows) {
-        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
-        // overlap with device computation.
-        ggml_backend_sched_reset(sched.get());
-    }
-
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
         //cross.t_embd = t_embd;

@@ -1224,12 +1208,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();
 
-    if (!supports_set_rows) {
-        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
-        // overlap with device computation.
-        ggml_backend_sched_reset(sched.get());
-    }
-
     return 0;
 }
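Both deleted blocks in encode() and decode() were guarded by !supports_set_rows. Since the field defaulted to true (see llama-context.h below) and the override is gone, they were dead code; the early ggml_backend_sched_reset() that overlapped the CPU-side reset with device computation on the legacy path goes away with them.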

src/llama-context.h
Lines changed: 0 additions & 4 deletions

@@ -283,10 +283,6 @@ struct llama_context {
 
     bool has_evaluated_once = false;
 
-    // env: LLAMA_SET_ROWS (temporary)
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = true;
-
     // env: LLAMA_GRAPH_REUSE_DISABLE
     bool graph_reuse_disable = false;
src/llama-graph.cpp
Lines changed: 0 additions & 4 deletions

@@ -314,8 +314,6 @@ bool llm_graph_input_attn_kv::can_reuse(const llm_graph_params & params) {
     res &= self_kq_mask->ne[0] == mctx->get_n_kv();
     res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
 
-    res &= mctx->get_supports_set_rows(); // TODO: tmp
-
     return res;
 }

@@ -350,8 +348,6 @@ bool llm_graph_input_attn_kv_iswa::can_reuse(const llm_graph_params & params) {
     res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
     res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);
 
-    res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp
-
     return res;
 }
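With these two lines gone, can_reuse() for both attention input types depends only on the shape checks above it, so graph reuse is no longer vetoed by the toggle.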

src/llama-kv-cache.cpp
Lines changed: 20 additions & 84 deletions

@@ -197,18 +197,6 @@ llama_kv_cache::llama_kv_cache(
 
     const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
     debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
-
-    const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
-
-    if (!supports_set_rows) {
-        // ref: https://github.com/ggml-org/llama.cpp/pull/14363
-        GGML_ASSERT(unified && "cannot use non-unified KV cache without ggml_set_rows() support");
-    }
-
-    if (!supports_set_rows) {
-        LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
-    }
 }
 
 void llama_kv_cache::clear(bool data) {

@@ -551,11 +539,8 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
     bool success = true;
 
     for (const auto & ubatch : ubatches) {
-        // non-continuous slots require support for ggml_set_rows()
-        const bool cont = supports_set_rows ? false : true;
-
         // only find a suitable slot for the ubatch. don't modify the cells yet
-        const auto sinfo_new = find_slot(ubatch, cont);
+        const auto sinfo_new = find_slot(ubatch, true);
         if (sinfo_new.empty()) {
             success = false;
             break;
@@ -976,10 +961,6 @@ uint32_t llama_kv_cache::get_n_kv() const {
     return result;
 }
 
-bool llama_kv_cache::get_supports_set_rows() const {
-    return supports_set_rows;
-}
-
 ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);
 
@@ -1033,36 +1014,26 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_k
 }
 
 ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
+    GGML_UNUSED(sinfo);
+
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * k = layers[ikv].k;
 
-    const int64_t n_embd_k_gqa = k->ne[0];
     const int64_t n_tokens = k_cur->ne[2];
 
     k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);
 
-    if (k_idxs && supports_set_rows) {
-        if (k->ne[2] > 1) {
-            k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
-        }
-
-        return ggml_set_rows(ctx, k, k_cur, k_idxs);
+    if (k->ne[2] > 1) {
+        k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
     }
 
-    // TODO: fallback to old ggml_cpy() method for backwards compatibility
-    // will be removed when ggml_set_rows() is adopted by all backends
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
-
-    ggml_tensor * k_view = ggml_view_1d(ctx, k,
-            n_tokens*n_embd_k_gqa,
-            ggml_row_size(k->type, n_embd_k_gqa)*sinfo.head());
-
-    return ggml_cpy(ctx, k_cur, k_view);
+    return ggml_set_rows(ctx, k, k_cur, k_idxs);
 }
 
 ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
+    GGML_UNUSED(sinfo);
+
     const int32_t ikv = map_layer_ids.at(il);
 
     auto * v = layers[ikv].v;

@@ -1072,48 +1043,25 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggm
 
     v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);
 
-    if (v_idxs && supports_set_rows) {
-        if (!v_trans) {
-            if (v->ne[2] > 1) {
-                v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
-            }
-
-            return ggml_set_rows(ctx, v, v_cur, v_idxs);
-        }
-
-        // [TAG_V_CACHE_VARIABLE]
-        if (n_embd_v_gqa < v->ne[0]) {
-            v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    if (!v_trans) {
+        if (v->ne[2] > 1) {
+            v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
         }
 
-        // the row becomes a single element
-        ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
-
-        v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
-
-        return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
+        return ggml_set_rows(ctx, v, v_cur, v_idxs);
     }
 
-    // TODO: fallback to old ggml_cpy() method for backwards compatibility
-    // will be removed when ggml_set_rows() is adopted by all backends
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
+    // [TAG_V_CACHE_VARIABLE]
+    if (n_embd_v_gqa < v->ne[0]) {
+        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    }
 
-    ggml_tensor * v_view = nullptr;
+    // the row becomes a single element
+    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
 
-    if (!v_trans) {
-        v_view = ggml_view_1d(ctx, v,
-                n_tokens*n_embd_v_gqa,
-                ggml_row_size(v->type, n_embd_v_gqa)*sinfo.head());
-    } else {
-        v_cur = ggml_transpose(ctx, v_cur);
+    v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
 
-        v_view = ggml_view_2d(ctx, v, n_tokens, n_embd_v_gqa,
-                (v->ne[1]    )*ggml_element_size(v),
-                (sinfo.head())*ggml_element_size(v));
-    }
-
-    return ggml_cpy(ctx, v_cur, v_view);
+    return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
 }
 
 ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
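After this change, cpy_k() and cpy_v() unconditionally scatter the new K/V rows into the cache with ggml_set_rows(ctx, dst, src, idxs), where row i of src is written to row idxs[i] of dst. Below is a minimal self-contained sketch of the operator on the CPU backend (toy sizes, not llama.cpp code; it assumes compilation inside this source tree with its split ggml.h/ggml-cpu.h headers):

    // sketch_set_rows.cpp: toy demonstration of ggml_set_rows(), the op this
    // commit makes mandatory for KV-cache writes.
    #include "ggml.h"
    #include "ggml-cpu.h" // ggml_graph_compute_with_ctx
    #include <cstdio>
    #include <cstring>

    int main() {
        struct ggml_init_params ip = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(ip);

        // "cache": 4 row slots of width 3; "cur": 2 freshly computed rows
        struct ggml_tensor * cache = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 4);
        struct ggml_tensor * cur   = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 3, 2);
        struct ggml_tensor * idxs  = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 2);

        memset(cache->data, 0, ggml_nbytes(cache));
        for (int i = 0; i < 6; ++i) {
            ((float *) cur->data)[i] = (float) (i + 1);
        }

        // scatter: row 0 of cur -> slot 3, row 1 of cur -> slot 1 (non-contiguous)
        ((int64_t *) idxs->data)[0] = 3;
        ((int64_t *) idxs->data)[1] = 1;

        // the result aliases cache, so the scatter lands in cache's buffer
        // (this is how cpy_k()/cpy_v() write into the KV cache tensors)
        struct ggml_tensor * out = ggml_set_rows(ctx, cache, cur, idxs);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, out);
        ggml_graph_compute_with_ctx(ctx, gf, /*n_threads =*/ 1);

        for (int r = 0; r < 4; ++r) {     // rows 1 and 3 now hold cur's data,
            for (int c = 0; c < 3; ++c) { // rows 0 and 2 remain zero
                printf("%4.0f", ((float *) cache->data)[r*3 + c]);
            }
            printf("\n");
        }

        ggml_free(ctx);
        return 0;
    }

The index indirection is what made non-continuous slots possible in prepare() above: the old ggml_cpy() fallback needed one contiguous view starting at sinfo.head() (hence its n_stream == 1 assert), while the index tensors built by build_input_k_idxs()/build_input_v_idxs() let each token land in an arbitrary cell.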
@@ -1143,10 +1091,6 @@ ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama
 }
 
 void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
-    if (!supports_set_rows) {
-        return;
-    }
-
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());

@@ -1163,10 +1107,6 @@ void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ub
 }
 
 void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
-    if (!supports_set_rows) {
-        return;
-    }
-
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());
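With the early returns removed, the index tensors feeding ggml_set_rows() are populated on every graph evaluation, not only when the toggle was on.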

@@ -2005,10 +1945,6 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
     return n_kv;
 }
 
-bool llama_kv_cache_context::get_supports_set_rows() const {
-    return kv->get_supports_set_rows();
-}
-
 ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
     return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
 }

src/llama-kv-cache.h
Lines changed: 0 additions & 10 deletions

@@ -141,9 +141,6 @@ class llama_kv_cache : public llama_memory_i {
 
     uint32_t get_n_kv() const;
 
-    // TODO: temporary
-    bool get_supports_set_rows() const;
-
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
     ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;

@@ -215,10 +212,6 @@ class llama_kv_cache : public llama_memory_i {
     // env: LLAMA_KV_CACHE_DEBUG
     int debug = 0;
 
-    // env: LLAMA_SET_ROWS (temporary)
-    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    bool supports_set_rows = true;
-
     const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
 
     std::vector<ggml_context_ptr> ctxs;

@@ -318,9 +311,6 @@ class llama_kv_cache_context : public llama_memory_context_i {
 
     uint32_t get_n_kv() const;
 
-    // TODO: temporary
-    bool get_supports_set_rows() const;
-
     // get views of the current state of the cache
     ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
     ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
