@@ -197,18 +197,6 @@ llama_kv_cache::llama_kv_cache(

     const char * LLAMA_KV_CACHE_DEBUG = getenv("LLAMA_KV_CACHE_DEBUG");
     debug = LLAMA_KV_CACHE_DEBUG ? atoi(LLAMA_KV_CACHE_DEBUG) : 0;
-
-    const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-    supports_set_rows = LLAMA_SET_ROWS ? atoi(LLAMA_SET_ROWS) != 0 : supports_set_rows;
-
-    if (!supports_set_rows) {
-        // ref: https://github.com/ggml-org/llama.cpp/pull/14363
-        GGML_ASSERT(unified && "cannot use non-unified KV cache without ggml_set_rows() support");
-    }
-
-    if (!supports_set_rows) {
-        LLAMA_LOG_WARN("%s: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility\n", __func__);
-    }
 }

 void llama_kv_cache::clear(bool data) {
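
This hunk removes the LLAMA_SET_ROWS escape hatch entirely: supports_set_rows is no longer read from the environment, so the ggml_set_rows() write path is always active and both the non-unified assert and the fallback warning become dead code. The removed lines used the common getenv toggle pattern; a minimal self-contained sketch of that pattern (the env_flag helper is illustrative, not part of llama.cpp):

#include <cstdlib>

// Hypothetical helper showing the removed toggle's behavior:
// unset keeps the default, "0" disables, any other integer enables.
static bool env_flag(const char * name, bool def) {
    const char * s = getenv(name);
    return s ? atoi(s) != 0 : def;
}

// Before this change, roughly: supports_set_rows = env_flag("LLAMA_SET_ROWS", supports_set_rows);
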
@@ -551,11 +539,8 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_ubatch> & ubatches) {
     bool success = true;

     for (const auto & ubatch : ubatches) {
-        // non-continuous slots require support for ggml_set_rows()
-        const bool cont = supports_set_rows ? false : true;
-
         // only find a suitable slot for the ubatch. don't modify the cells yet
-        const auto sinfo_new = find_slot(ubatch, cont);
+        const auto sinfo_new = find_slot(ubatch, true);
         if (sinfo_new.empty()) {
             success = false;
             break;
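
prepare() still only probes for a suitable slot per ubatch; what changes is that the slot search no longer depends on a runtime capability flag, because writes always go through ggml_set_rows(), which scatters source rows to explicit destination rows. A reference-semantics sketch for a row-major 2-D destination, assuming source and destination share a type (the real op also converts to the destination type on write):

#include <cstdint>
#include <cstring>

// Reference semantics of set_rows for row-major 2-D tensors (sketch only):
// dst[idxs[i], :] = src[i, :] for each source row i.
static void set_rows_ref(float * dst, const float * src, const int64_t * idxs,
                         int64_t n_rows_src, int64_t row_width) {
    for (int64_t i = 0; i < n_rows_src; ++i) {
        memcpy(dst + idxs[i]*row_width, src + i*row_width, row_width*sizeof(float));
    }
}

Since each source row carries its own destination index, the op itself places no contiguity requirement on where a ubatch's rows land in the cache.
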
@@ -976,10 +961,6 @@ uint32_t llama_kv_cache::get_n_kv() const {
     return result;
 }

-bool llama_kv_cache::get_supports_set_rows() const {
-    return supports_set_rows;
-}
-
 ggml_tensor * llama_kv_cache::get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
     const int32_t ikv = map_layer_ids.at(il);

@@ -1033,36 +1014,26 @@ ggml_tensor * llama_kv_cache::get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const {
 }

 ggml_tensor * llama_kv_cache::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const {
+    GGML_UNUSED(sinfo);
+
     const int32_t ikv = map_layer_ids.at(il);

     auto * k = layers[ikv].k;

-    const int64_t n_embd_k_gqa = k->ne[0];
     const int64_t n_tokens = k_cur->ne[2];

     k_cur = ggml_reshape_2d(ctx, k_cur, k->ne[0], n_tokens);

-    if (k_idxs && supports_set_rows) {
-        if (k->ne[2] > 1) {
-            k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
-        }
-
-        return ggml_set_rows(ctx, k, k_cur, k_idxs);
+    if (k->ne[2] > 1) {
+        k = ggml_reshape_2d(ctx, k, k->ne[0], k->ne[1]*k->ne[2]);
     }

-    // TODO: fallback to old ggml_cpy() method for backwards compatibility
-    // will be removed when ggml_set_rows() is adopted by all backends
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
-
-    ggml_tensor * k_view = ggml_view_1d(ctx, k,
-            n_tokens*n_embd_k_gqa,
-            ggml_row_size(k->type, n_embd_k_gqa)*sinfo.head());
-
-    return ggml_cpy(ctx, k_cur, k_view);
+    return ggml_set_rows(ctx, k, k_cur, k_idxs);
 }

 ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {
+    GGML_UNUSED(sinfo);
+
     const int32_t ikv = map_layer_ids.at(il);

     auto * v = layers[ikv].v;
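
After this hunk, cpy_k() and cpy_v() build the ggml_set_rows() node unconditionally, and sinfo survives only through the index tensors prepared elsewhere. A standalone sketch of the same graph pattern with toy sizes (assuming, as the quantized KV path relies on, that ggml_set_rows() converts the F32 source rows to the destination tensor's type on write):

#include "ggml.h"

// Toy version of the cpy_k pattern: scatter 4 new token rows into a
// 1024-cell K cache at explicit row indices.
static ggml_tensor * build_kv_write(ggml_context * ctx) {
    ggml_tensor * k     = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, 128, 1024); // [n_embd, n_cells]
    ggml_tensor * k_cur = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 4);    // new rows (F32 source)
    ggml_tensor * idxs  = ggml_new_tensor_1d(ctx, GGML_TYPE_I64, 4);         // destination cells

    // ggml_set_rows(dst, src, idxs): the dst rows named by idxs receive the
    // src rows, converted to dst's type (here F32 -> F16) on write.
    return ggml_set_rows(ctx, k, k_cur, idxs);
}
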
@@ -1072,48 +1043,25 @@ ggml_tensor * llama_kv_cache::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, ggml_tensor * v_idxs, int32_t il, const slot_info & sinfo) const {

     v_cur = ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens);

-    if (v_idxs && supports_set_rows) {
-        if (!v_trans) {
-            if (v->ne[2] > 1) {
-                v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
-            }
-
-            return ggml_set_rows(ctx, v, v_cur, v_idxs);
-        }
-
-        // [TAG_V_CACHE_VARIABLE]
-        if (n_embd_v_gqa < v->ne[0]) {
-            v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    if (!v_trans) {
+        if (v->ne[2] > 1) {
+            v = ggml_reshape_2d(ctx, v, v->ne[0], v->ne[1]*v->ne[2]);
         }

-        // the row becomes a single element
-        ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);
-
-        v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);
-
-        return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
+        return ggml_set_rows(ctx, v, v_cur, v_idxs);
     }

-    // TODO: fallback to old ggml_cpy() method for backwards compatibility
-    // will be removed when ggml_set_rows() is adopted by all backends
-
-    GGML_ASSERT(n_stream == 1 && "n_stream > 1 not supported without LLAMA_SET_ROWS");
+    // [TAG_V_CACHE_VARIABLE]
+    if (n_embd_v_gqa < v->ne[0]) {
+        v_cur = ggml_pad(ctx, v_cur, v->ne[0] - n_embd_v_gqa, 0, 0, 0);
+    }

-    ggml_tensor * v_view = nullptr;
+    // the row becomes a single element
+    ggml_tensor * v_view = ggml_reshape_2d(ctx, v, 1, v->ne[0]*v->ne[1]*v->ne[2]);

-    if (!v_trans) {
-        v_view = ggml_view_1d(ctx, v,
-                n_tokens*n_embd_v_gqa,
-                ggml_row_size(v->type, n_embd_v_gqa)*sinfo.head());
-    } else {
-        v_cur = ggml_transpose(ctx, v_cur);
+    v_cur = ggml_reshape_2d(ctx, v_cur, 1, v_cur->ne[0]*v_cur->ne[1]);

-        v_view = ggml_view_2d(ctx, v, n_tokens, n_embd_v_gqa,
-                (v->ne[1])*ggml_element_size(v),
-                (sinfo.head())*ggml_element_size(v));
-    }
-
-    return ggml_cpy(ctx, v_cur, v_view);
+    return ggml_set_rows(ctx, v_view, v_cur, v_idxs);
 }

 ggml_tensor * llama_kv_cache::build_input_k_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
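
The surviving transposed-V branch is the subtle part: the whole cache is viewed as rows of width 1 so that each scalar element can be scattered independently. A worked sketch of that index arithmetic, assuming the transposed cache is laid out cell-major as [n_cells, n_embd]:

#include <cstdint>
#include <cstdio>

// With the transposed V cache viewed as [1, n_cells*n_embd], dimension j of
// a token written to cell c lands at flat row j*n_cells + c.
static int64_t v_trans_row(int64_t j, int64_t c, int64_t n_cells) {
    return j*n_cells + c;
}

int main() {
    const int64_t n_cells = 4, n_embd = 3, cell = 2;
    for (int64_t j = 0; j < n_embd; ++j) {
        printf("dim %lld -> flat row %lld\n", (long long) j, (long long) v_trans_row(j, cell, n_cells));
    }
    return 0; // prints rows 2, 6, 10: one destination index per element
}

Because every element needs its own destination row, the transposed path uses an index tensor sized per element rather than per token.
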
@@ -1143,10 +1091,6 @@ ggml_tensor * llama_kv_cache::build_input_v_idxs(ggml_context * ctx, const llama_ubatch & ubatch) const {
 }

 void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
-    if (!supports_set_rows) {
-        return;
-    }
-
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());

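
With the supports_set_rows early-out gone, set_input_k_idxs() always populates the I64 index tensor consumed by ggml_set_rows(). A sketch of the simplest single-stream, contiguous-slot fill (fill_k_idxs and head are illustrative names; the real code walks sinfo's per-stream cell indices):

#include <cstdint>

// Illustrative fill: token i of the ubatch is written to cache cell head + i.
static void fill_k_idxs(int64_t * data, uint32_t n_tokens, uint32_t head) {
    for (uint32_t i = 0; i < n_tokens; ++i) {
        data[i] = (int64_t) head + i;
    }
}
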
@@ -1163,10 +1107,6 @@ void llama_kv_cache::set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
 }

 void llama_kv_cache::set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const {
-    if (!supports_set_rows) {
-        return;
-    }
-
     const uint32_t n_tokens = ubatch->n_tokens;
     GGML_ASSERT(n_tokens == (int64_t) sinfo.size()*sinfo.n_stream());

@@ -2005,10 +1945,6 @@ uint32_t llama_kv_cache_context::get_n_kv() const {
     return n_kv;
 }

-bool llama_kv_cache_context::get_supports_set_rows() const {
-    return kv->get_supports_set_rows();
-}
-
 ggml_tensor * llama_kv_cache_context::get_k(ggml_context * ctx, int32_t il) const {
     return kv->get_k(ctx, il, n_kv, sinfos[i_cur]);
 }