Commit f0797a6

Merge branch main into custom_rope
1 parent 3f8f276 commit f0797a6

File tree

8 files changed: +212 -69 lines changed

8 files changed

+212
-69
lines changed

CHANGELOG.md

Lines changed: 10 additions & 0 deletions
@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.1.71]
+
+### Added
+
+- (llama.cpp) Update llama.cpp
+
+### Fixed
+
+- (server) Fix several pydantic v2 migration bugs
+
 ## [0.1.70]
 
 ### Fixed

README.md

Lines changed: 1 addition & 0 deletions
@@ -135,6 +135,7 @@ A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python).
 ```bash
 docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
 ```
+[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389)
 
 ## Low-level API
 

llama_cpp/llama.py

Lines changed: 15 additions & 1 deletion
@@ -19,14 +19,14 @@
 from collections import deque, OrderedDict
 
 import diskcache
+import ctypes
 
 from . import llama_cpp
 from .llama_types import *
 
 import numpy as np
 import numpy.typing as npt
 
-
 class BaseLlamaCache(ABC):
     """Base cache class for a llama.cpp model."""
 
@@ -222,6 +222,7 @@ def __init__(
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
         low_vram: bool = False,
+        tensor_split: Optional[List[float]] = None,
         verbose: bool = True,
     ):
         """Load a llama.cpp model from `model_path`.
@@ -244,6 +245,7 @@ def __init__(
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
+            tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split.
             verbose: Print verbose output to stderr.
 
         Raises:
@@ -252,6 +254,7 @@ def __init__(
         Returns:
             A Llama instance.
         """
+
         self.verbose = verbose
         self.model_path = model_path
 
@@ -269,6 +272,15 @@ def __init__(
         self.params.embedding = embedding
         self.params.low_vram = low_vram
 
+        self.tensor_split = tensor_split
+        self._c_tensor_split = None
+
+        if self.tensor_split is not None:
+            #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES
+            FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value
+            self._c_tensor_split = FloatArray(*tensor_split) # keep a reference to the array so it is not gc'd
+            self.params.tensor_split = self._c_tensor_split
+
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
 
@@ -1509,6 +1521,7 @@ def __getstate__(self):
             n_threads=self.n_threads,
             lora_base=self.lora_base,
             lora_path=self.lora_path,
+            tensor_split=self.tensor_split,
             ### DEPRECATED ###
             n_parts=self.n_parts,
             ### DEPRECATED ###
@@ -1533,6 +1546,7 @@ def __setstate__(self, state):
             last_n_tokens_size=state["last_n_tokens_size"],
             lora_base=state["lora_base"],
            lora_path=state["lora_path"],
+            tensor_split=state["tensor_split"],
             verbose=state["verbose"],
         )
 
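The new `tensor_split` argument flows from `Llama.__init__` through pickling and into the underlying `llama_context_params.tensor_split` array. A hedged usage sketch follows; the model path, layer count, and split ratios are placeholders, not values taken from this commit.

```python
# Illustrative only: split the model's tensors across two GPUs with the new
# tensor_split parameter. Path and ratios below are placeholders.
from llama_cpp import Llama

llm = Llama(
    model_path="./models/ggml-model-q4_0.bin",  # placeholder path
    n_gpu_layers=40,          # offload layers to the GPUs
    tensor_split=[0.6, 0.4],  # roughly 60% of tensors on device 0, 40% on device 1
)
```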

llama_cpp/llama_cpp.py

Lines changed: 128 additions & 9 deletions
@@ -165,12 +165,16 @@ class llama_token_data_array(Structure):
 # int32_t n_gpu_layers; // number of layers to store in VRAM
 # int32_t main_gpu; // the GPU that is used for scratch and small tensors
 # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+
+# // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+# float rope_freq_base; // RoPE base frequency
+# float rope_freq_scale; // RoPE frequency scaling factor
+
 # // called with a progress value between 0 and 1, pass NULL to disable
 # llama_progress_callback progress_callback;
 # // context pointer passed to the progress callback
 # void * progress_callback_user_data;
 
-
 # // Keep the booleans together to avoid misalignment during copy-by-value.
 # bool low_vram; // if true, reduce VRAM usage at the cost of performance
 # bool f16_kv; // use fp16 for KV cache
@@ -190,6 +194,8 @@ class llama_context_params(Structure):
         ("n_gpu_layers", c_int32),
         ("main_gpu", c_int32),
         ("tensor_split", c_float * LLAMA_MAX_DEVICES.value),
+        ("rope_freq_base", c_float),
+        ("rope_freq_scale", c_float),
         ("progress_callback", llama_progress_callback),
         ("progress_callback_user_data", c_void_p),
         ("low_vram", c_bool),
@@ -328,13 +334,23 @@ def llama_mlock_supported() -> bool:
 # // Initialize the llama + ggml backend
 # // If numa is true, use NUMA optimizations
 # // Call once at the start of the program
-# LLAMA_API void llama_init_backend(bool numa);
-def llama_init_backend(numa: c_bool):
-    return _lib.llama_init_backend(numa)
+# LLAMA_API void llama_backend_init(bool numa);
+def llama_backend_init(numa: c_bool):
+    return _lib.llama_backend_init(numa)
+
+
+_lib.llama_backend_init.argtypes = [c_bool]
+_lib.llama_backend_init.restype = None
+
 
+# // Call once at the end of the program - currently only used for MPI
+# LLAMA_API void llama_backend_free();
+def llama_backend_free():
+    return _lib.llama_backend_free()
 
-_lib.llama_init_backend.argtypes = [c_bool]
-_lib.llama_init_backend.restype = None
+
+_lib.llama_backend_free.argtypes = []
+_lib.llama_backend_free.restype = None
 
 
 # LLAMA_API struct llama_model * llama_load_model_from_file(
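A hedged sketch of the renamed lifecycle pair; note that the module already calls `llama_backend_init` once at import time (see the final hunk below), so the explicit calls here are illustrative only.

```python
# Illustrative only: explicit backend lifecycle with the renamed bindings.
# llama_cpp normally auto-initializes the backend on import.
from ctypes import c_bool
import llama_cpp

llama_cpp.llama_backend_init(c_bool(False))  # numa=False
try:
    pass  # load models, create contexts, run inference here
finally:
    llama_cpp.llama_backend_free()  # per the header comment, currently only used for MPI
```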
@@ -648,6 +664,22 @@ def llama_tokenize(
 _lib.llama_tokenize.restype = c_int
 
 
+# LLAMA_API int llama_tokenize_with_model(
+#     const struct llama_model * model,
+#     const char * text,
+#     llama_token * tokens,
+#     int n_max_tokens,
+#     bool add_bos);
+def llama_tokenize_with_model(
+    model: llama_model_p,
+    text: bytes,
+    tokens,  # type: Array[llama_token]
+    n_max_tokens: c_int,
+    add_bos: c_bool,
+) -> int:
+    return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos)
+
+
 # LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
 def llama_n_vocab(ctx: llama_context_p) -> int:
     return _lib.llama_n_vocab(ctx)
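A hedged sketch of calling the new model-level tokenizer directly; the model path and buffer size are placeholders, and error handling is reduced to a comment.

```python
# Illustrative only: tokenize with a llama_model handle instead of a context.
import ctypes
import llama_cpp

params = llama_cpp.llama_context_default_params()
model = llama_cpp.llama_load_model_from_file(b"./models/ggml-model-q4_0.bin", params)  # placeholder path

text = b"Hello, llama.cpp"
max_tokens = 32
tokens = (llama_cpp.llama_token * max_tokens)()  # output buffer
n = llama_cpp.llama_tokenize_with_model(
    model, text, tokens, ctypes.c_int(max_tokens), ctypes.c_bool(True)
)
print(list(tokens[:n]))  # a negative n would mean the buffer was too small
```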
@@ -675,6 +707,33 @@ def llama_n_embd(ctx: llama_context_p) -> int:
 _lib.llama_n_embd.restype = c_int
 
 
+# LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+def llama_n_vocab_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_vocab_from_model(model)
+
+
+_lib.llama_n_vocab_from_model.argtypes = [llama_model_p]
+_lib.llama_n_vocab_from_model.restype = c_int
+
+
+# LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
+def llama_n_ctx_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_ctx_from_model(model)
+
+
+_lib.llama_n_ctx_from_model.argtypes = [llama_model_p]
+_lib.llama_n_ctx_from_model.restype = c_int
+
+
+# LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+def llama_n_embd_from_model(model: llama_model_p) -> int:
+    return _lib.llama_n_embd_from_model(model)
+
+
+_lib.llama_n_embd_from_model.argtypes = [llama_model_p]
+_lib.llama_n_embd_from_model.restype = c_int
+
+
 # // Get the vocabulary as output parameters.
 # // Returns number of results.
 # LLAMA_API int llama_get_vocab(
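These `*_from_model` helpers expose model properties without needing a context. A brief hedged sketch, reusing the `model` handle from the previous sketch:

```python
# Illustrative only: query model properties straight from the model handle.
# Assumes `model` was loaded with llama_load_model_from_file as above.
import llama_cpp

n_vocab = llama_cpp.llama_n_vocab_from_model(model)
n_ctx = llama_cpp.llama_n_ctx_from_model(model)
n_embd = llama_cpp.llama_n_embd_from_model(model)
print(n_vocab, n_ctx, n_embd)
```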
@@ -695,6 +754,20 @@ def llama_get_vocab(
 _lib.llama_get_vocab.restype = c_int
 
 
+# LLAMA_API int llama_get_vocab_from_model(
+#     const struct llama_model * model,
+#     const char * * strings,
+#     float * scores,
+#     int capacity);
+def llama_get_vocab_from_model(
+    model: llama_model_p,
+    strings,  # type: Array[c_char_p] # type: ignore
+    scores,  # type: Array[c_float] # type: ignore
+    capacity: c_int,
+) -> int:
+    return _lib.llama_get_vocab_from_model(model, strings, scores, capacity)
+
+
 # Token logits obtained from the last call to llama_eval()
 # The logits for the last token are stored in the last row
 # Can be mutated in order to change the probabilities of the next token
@@ -724,15 +797,28 @@ def llama_get_embeddings(
 _lib.llama_get_embeddings.restype = c_float_p
 
 
-# Token Id -> String. Uses the vocabulary in the provided context
-# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+# // Token Id -> String. Uses the vocabulary in the provided context
+# LLAMA_API const char * llama_token_to_str(
+#     const struct llama_context * ctx,
+#     llama_token token);
 def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes:
     return _lib.llama_token_to_str(ctx, token)
 
 
 _lib.llama_token_to_str.argtypes = [llama_context_p, llama_token]
 _lib.llama_token_to_str.restype = c_char_p
 
+
+# LLAMA_API const char * llama_token_to_str_with_model(
+#     const struct llama_model * model,
+#     llama_token token);
+def llama_token_to_str_with_model(model: llama_model_p, token: llama_token) -> bytes:
+    return _lib.llama_token_to_str_with_model(model, token)
+
+
+_lib.llama_token_to_str_with_model.argtypes = [llama_model_p, llama_token]
+_lib.llama_token_to_str_with_model.restype = c_char_p
+
 # Special tokens
 
 
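A short hedged sketch of decoding a token id back to bytes via the model handle, assuming the `model` and `tokens` from the earlier tokenization sketch:

```python
# Illustrative only: decode the first token from the earlier sketch.
# Assumes `model` and `tokens` from the llama_tokenize_with_model example.
piece = llama_cpp.llama_token_to_str_with_model(model, tokens[0])
print(piece.decode("utf-8", errors="ignore"))
```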

@@ -821,6 +907,39 @@ def llama_sample_frequency_and_presence_penalties(
 _lib.llama_sample_frequency_and_presence_penalties.restype = None
 
 
+# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
+# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
+# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
+# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
+# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
+# LLAMA_API void llama_sample_classifier_free_guidance(
+#     struct llama_context * ctx,
+#     llama_token_data_array * candidates,
+#     struct llama_context * guidance_ctx,
+#     float scale,
+#     float smooth_factor);
+def llama_sample_classifier_free_guidance(
+    ctx: llama_context_p,
+    candidates,  # type: _Pointer[llama_token_data_array]
+    guidance_ctx: llama_context_p,
+    scale: c_float,
+    smooth_factor: c_float,
+):
+    return _lib.llama_sample_classifier_free_guidance(
+        ctx, candidates, guidance_ctx, scale, smooth_factor
+    )
+
+
+_lib.llama_sample_classifier_free_guidance.argtypes = [
+    llama_context_p,
+    llama_token_data_array_p,
+    llama_context_p,
+    c_float,
+    c_float,
+]
+_lib.llama_sample_classifier_free_guidance.restype = None
+
+
 # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
 def llama_sample_softmax(
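Since the doc comments above spell out the expected inputs, here is a hedged sketch of driving the binding by hand. It assumes `ctx` (main prompt) and `guidance_ctx` (negative prompt) are already-evaluated contexts created from the same model, and the scale and smooth values are illustrative.

```python
# Illustrative only: classifier-free guidance via the low-level binding.
# Assumes `ctx` and `guidance_ctx` are already-evaluated llama_context_p handles.
import ctypes
import llama_cpp

n_vocab = llama_cpp.llama_n_vocab(ctx)
logits = llama_cpp.llama_get_logits(ctx)

# Build the unsorted candidate array straight from the last row of logits.
_arr = (llama_cpp.llama_token_data * n_vocab)(
    *[llama_cpp.llama_token_data(token_id, logits[token_id], 0.0)
      for token_id in range(n_vocab)]
)
candidates_p = ctypes.pointer(llama_cpp.llama_token_data_array(_arr, len(_arr), False))

llama_cpp.llama_sample_classifier_free_guidance(
    ctx,
    candidates_p,
    guidance_ctx,
    ctypes.c_float(1.5),  # guidance strength (illustrative)
    ctypes.c_float(1.0),  # lean fully on the guidance logits (illustrative)
)
next_token = llama_cpp.llama_sample_token(ctx, candidates_p)
```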
@@ -1065,5 +1184,5 @@ def llama_print_system_info() -> bytes:
 _llama_initialized = False
 
 if not _llama_initialized:
-    llama_init_backend(c_bool(False))
+    llama_backend_init(c_bool(False))
     _llama_initialized = True
