@@ -165,12 +165,16 @@ class llama_token_data_array(Structure):
# int32_t n_gpu_layers; // number of layers to store in VRAM
# int32_t main_gpu; // the GPU that is used for scratch and small tensors
# float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+# // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+# float rope_freq_base; // RoPE base frequency
+# float rope_freq_scale; // RoPE frequency scaling factor
+
# // called with a progress value between 0 and 1, pass NULL to disable
# llama_progress_callback progress_callback;
# // context pointer passed to the progress callback
# void * progress_callback_user_data;

-
# // Keep the booleans together to avoid misalignment during copy-by-value.
# bool low_vram; // if true, reduce VRAM usage at the cost of performance
# bool f16_kv; // use fp16 for KV cache
@@ -190,6 +194,8 @@ class llama_context_params(Structure):
        ("n_gpu_layers", c_int32),
        ("main_gpu", c_int32),
        ("tensor_split", c_float * LLAMA_MAX_DEVICES.value),
+        ("rope_freq_base", c_float),
+        ("rope_freq_scale", c_float),
        ("progress_callback", llama_progress_callback),
        ("progress_callback_user_data", c_void_p),
        ("low_vram", c_bool),
@@ -328,13 +334,23 @@ def llama_mlock_supported() -> bool:
# // Initialize the llama + ggml backend
# // If numa is true, use NUMA optimizations
# // Call once at the start of the program
-# LLAMA_API void llama_init_backend(bool numa);
-def llama_init_backend(numa: c_bool):
-    return _lib.llama_init_backend(numa)
+# LLAMA_API void llama_backend_init(bool numa);
+def llama_backend_init(numa: c_bool):
+    return _lib.llama_backend_init(numa)
+
+
+_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.restype = None
+

+# // Call once at the end of the program - currently only used for MPI
+# LLAMA_API void llama_backend_free();
+def llama_backend_free():
+    return _lib.llama_backend_free()

-_lib.llama_init_backend.argtypes = [c_bool]
-_lib.llama_init_backend.restype = None
+
+_lib.llama_backend_free.argtypes = []
+_lib.llama_backend_free.restype = None


# LLAMA_API struct llama_model * llama_load_model_from_file(
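Usage sketch (not part of the diff): the renamed pair brackets the whole program. Note that the last hunk shows the module already calling llama_backend_init(c_bool(False)) at import time, so an explicit call mainly matters when NUMA optimizations are wanted.

import ctypes
import llama_cpp  # assumed import path

llama_cpp.llama_backend_init(ctypes.c_bool(True))  # True enables NUMA optimizations
try:
    pass  # load models, create contexts, run inference
finally:
    llama_cpp.llama_backend_free()  # per the comment above, currently only used for MPI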
@@ -648,6 +664,22 @@ def llama_tokenize(
_lib.llama_tokenize.restype = c_int


+# LLAMA_API int llama_tokenize_with_model(
+# const struct llama_model * model,
+# const char * text,
+# llama_token * tokens,
+# int n_max_tokens,
+# bool add_bos);
+def llama_tokenize_with_model(
+    model: llama_model_p,
+    text: bytes,
+    tokens,  # type: Array[llama_token]
+    n_max_tokens: c_int,
+    add_bos: c_bool,
+) -> int:
+    return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos)
+
+
# LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
def llama_n_vocab(ctx: llama_context_p) -> int:
    return _lib.llama_n_vocab(ctx)
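Usage sketch (not part of the diff): llama_tokenize_with_model, added above, only needs a llama_model, so text can be tokenized before any context exists. `model` is assumed to come from llama_load_model_from_file, and the negative-return convention is an assumption carried over from llama_tokenize.

import ctypes
import llama_cpp  # assumed import path

text = b"Hello, world"
n_max_tokens = 64  # arbitrary buffer size for this sketch
tokens = (llama_cpp.llama_token * n_max_tokens)()
n = llama_cpp.llama_tokenize_with_model(
    model, text, tokens, ctypes.c_int(n_max_tokens), ctypes.c_bool(True)
)
if n < 0:  # assumed: -n is the buffer size actually required
    raise RuntimeError(f"token buffer too small, need {-n}")
token_ids = list(tokens[:n])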
@@ -675,6 +707,33 @@ def llama_n_embd(ctx: llama_context_p) -> int:
_lib.llama_n_embd.restype = c_int


+# LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+def llama_n_vocab_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_vocab_from_model(model)
+
+
+_lib.llama_n_vocab_from_model.argtypes = [llama_model_p]
+_lib.llama_n_vocab_from_model.restype = c_int
+
+
+# LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
+def llama_n_ctx_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_ctx_from_model(model)
+
+
+_lib.llama_n_ctx_from_model.argtypes = [llama_model_p]
+_lib.llama_n_ctx_from_model.restype = c_int
+
+
+# LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+def llama_n_embd_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_embd_from_model(model)
+
+
+_lib.llama_n_embd_from_model.argtypes = [llama_model_p]
+_lib.llama_n_embd_from_model.restype = c_int
+
+
# // Get the vocabulary as output parameters.
# // Returns number of results.
# LLAMA_API int llama_get_vocab(
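Usage sketch (not part of the diff): the *_from_model getters added above read hyperparameters straight off a loaded llama_model, with no llama_context needed. `model` is assumed from the earlier sketches.

import llama_cpp  # assumed import path

n_vocab = llama_cpp.llama_n_vocab_from_model(model)
n_ctx = llama_cpp.llama_n_ctx_from_model(model)
n_embd = llama_cpp.llama_n_embd_from_model(model)
print(f"n_vocab={n_vocab} n_ctx={n_ctx} n_embd={n_embd}")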
@@ -695,6 +754,20 @@ def llama_get_vocab(
_lib.llama_get_vocab.restype = c_int


+# LLAMA_API int llama_get_vocab_from_model(
+# const struct llama_model * model,
+# const char * * strings,
+# float * scores,
+# int capacity);
+def llama_get_vocab_from_model(
+    model: llama_model_p,
+    strings,  # type: Array[c_char_p] # type: ignore
+    scores,  # type: Array[c_float] # type: ignore
+    capacity: c_int,
+) -> int:
+    return _lib.llama_get_vocab_from_model(model, strings, scores, capacity)
+
+
# Token logits obtained from the last call to llama_eval()
# The logits for the last token are stored in the last row
# Can be mutated in order to change the probabilities of the next token
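Usage sketch (not part of the diff): llama_get_vocab_from_model, added above, fills caller-provided output arrays. Sizing them with llama_n_vocab_from_model so that capacity covers the full vocabulary is an assumption of this sketch.

import ctypes
import llama_cpp  # assumed import path

capacity = llama_cpp.llama_n_vocab_from_model(model)
strings = (ctypes.c_char_p * capacity)()  # receives the token strings
scores = (ctypes.c_float * capacity)()    # receives the token scores
n = llama_cpp.llama_get_vocab_from_model(model, strings, scores, ctypes.c_int(capacity))
vocab = [(strings[i], scores[i]) for i in range(n)]  # list of (bytes, float) pairs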
@@ -724,15 +797,28 @@ def llama_get_embeddings(
_lib.llama_get_embeddings.restype = c_float_p


-# Token Id -> String. Uses the vocabulary in the provided context
-# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+# // Token Id -> String. Uses the vocabulary in the provided context
+# LLAMA_API const char * llama_token_to_str(
+# const struct llama_context * ctx,
+# llama_token token);
def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:
    return _lib.llama_token_to_str(ctx, token)


_lib.llama_token_to_str.argtypes = [llama_context_p, llama_token]
_lib.llama_token_to_str.restype = c_char_p

+
+# LLAMA_API const char * llama_token_to_str_with_model(
+# const struct llama_model * model,
+# llama_token token);
+def llama_token_to_str_with_model(model: llama_model_p, token: llama_token) -> bytes:
+    return _lib.llama_token_to_str_with_model(model, token)
+
+
+_lib.llama_token_to_str_with_model.argtypes = [llama_model_p, llama_token]
+_lib.llama_token_to_str_with_model.restype = c_char_p
+
# Special tokens

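Usage sketch (not part of the diff): detokenization via llama_token_to_str_with_model, added above, mirrors the context-based llama_token_to_str but needs only the model. `model` and `token_ids` are assumed from the earlier sketches.

import llama_cpp  # assumed import path

pieces = [llama_cpp.llama_token_to_str_with_model(model, t) for t in token_ids]
text = b"".join(pieces).decode("utf-8", errors="ignore")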
@@ -821,6 +907,39 @@ def llama_sample_frequency_and_presence_penalties(
_lib.llama_sample_frequency_and_presence_penalties.restype = None


+# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+# LLAMA_API void llama_sample_classifier_free_guidance(
+# struct llama_context * ctx,
+# llama_token_data_array * candidates,
+# struct llama_context * guidance_ctx,
+# float scale,
+# float smooth_factor);
+def llama_sample_classifier_free_guidance(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    guidance_ctx: llama_context_p,
+    scale: c_float,
+    smooth_factor: c_float,
+):
+    return _lib.llama_sample_classifier_free_guidance(
+        ctx, candidates, guidance_ctx, scale, smooth_factor
+    )
+
+
+_lib.llama_sample_classifier_free_guidance.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    llama_context_p,
+    c_float,
+    c_float,
+]
+_lib.llama_sample_classifier_free_guidance.restype = None
+
+
# @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
def llama_sample_softmax(
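Usage sketch (not part of the diff): a minimal classifier-free-guidance step using the binding added above. It assumes `ctx` has been evaluated on the prompt, `guidance_ctx` is a second context from the same model evaluated on the negative prompt plus the same generated tokens (as the doc comment describes), and that llama_get_logits, llama_token_data, llama_token_data_array and llama_sample_token_greedy come from the existing bindings. The scale and smooth_factor values are illustrative only.

import ctypes
import llama_cpp  # assumed import path

n_vocab = llama_cpp.llama_n_vocab(ctx)
logits = llama_cpp.llama_get_logits(ctx)  # unsorted logits from the main context
candidates = (llama_cpp.llama_token_data * n_vocab)()
for i in range(n_vocab):
    candidates[i].id = i
    candidates[i].logit = logits[i]
    candidates[i].p = 0.0
candidates_p = llama_cpp.llama_token_data_array(candidates, n_vocab, False)
llama_cpp.llama_sample_classifier_free_guidance(
    ctx,
    ctypes.byref(candidates_p),
    guidance_ctx,
    ctypes.c_float(1.5),  # scale > 1.0 strengthens the guidance
    ctypes.c_float(1.0),  # 1.0 = use only the guidance logits, per the comment above
)
token = llama_cpp.llama_sample_token_greedy(ctx, ctypes.byref(candidates_p))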
@@ -1065,5 +1184,5 @@ def llama_print_system_info() -> bytes:
_llama_initialized = False

if not _llama_initialized:
-    llama_init_backend(c_bool(False))
+    llama_backend_init(c_bool(False))
    _llama_initialized = True