diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 16b00a2f0..4b38dbacb 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -33,6 +33,7 @@ jobs: uses: docker/build-push-action@v4 with: context: . + file: "docker/simple/Dockerfile" push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 56524e0db..a73e347b5 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -26,7 +26,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | @@ -49,7 +49,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | @@ -72,7 +72,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | diff --git a/.gitignore b/.gitignore index 36ed7f7fd..3866fb251 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ _skbuild/ .envrc +.direnv models/ diff --git a/CHANGELOG.md b/CHANGELOG.md index c6cfaab28..c7723c529 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,26 +7,61 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.72] + +### Added + +- (llama.cpp) Update llama.cpp added custom_rope for extended context lengths + +## [0.1.71] + +### Added + +- (llama.cpp) Update llama.cpp + +### Fixed + +- (server) Fix several pydantic v2 migration bugs + +## [0.1.70] + +### Fixed + +- (Llama.create_completion) Revert change so that `max_tokens` is not truncated to `context_size` in `create_completion` +- (server) Fixed changed settings field names from pydantic v2 migration + +## [0.1.69] + +### Added + +- (server) Streaming requests can are now interrupted pre-maturely when a concurrent request is made. Can be controlled with the `interrupt_requests` setting. +- (server) Moved to fastapi v0.100.0 and pydantic v2 +- (docker) Added a new "simple" image that builds llama.cpp from source when started. + +## Fixed + +- (server) performance improvements by avoiding unnecessary memory allocations during sampling + ## [0.1.68] -## [Added] +### Added - (llama.cpp) Update llama.cpp ## [0.1.67] -## Fixed +### Fixed - Fix performance bug in Llama model by pre-allocating memory tokens and logits. - Fix bug in Llama model where the model was not free'd after use. 
## [0.1.66] -## Added +### Added - (llama.cpp) New model API -## Fixed +### Fixed - Performance issue during eval caused by looped np.concatenate call - State pickling issue when saving cache to disk diff --git a/Makefile b/Makefile index 66d93f3a2..c359260b6 100644 --- a/Makefile +++ b/Makefile @@ -33,6 +33,15 @@ deploy.gh-docs: mkdocs build mkdocs gh-deploy +test: + python3 -m pytest + +docker: + docker build -t llama-cpp-python:latest -f docker/simple/Dockerfile . + +run-server: + uvicorn --factory llama.server:app --host ${HOST} --port ${PORT} + clean: - cd vendor/llama.cpp && make clean - cd vendor/llama.cpp && rm libllama.so @@ -53,4 +62,5 @@ clean: build.sdist \ deploy.pypi \ deploy.gh-docs \ + docker \ clean \ No newline at end of file diff --git a/README.md b/README.md index fb652a925..1f3dcb5ab 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,15 @@ Below is a short example demonstrating how to use the high-level API to generate } ``` +### Adjusting the Context Window +The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements. + +For instance, if you want to work with larger contexts, you can expand the context window by setting the n_ctx parameter when initializing the Llama object: + +```python +llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048) +``` + ## Web Server `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. @@ -126,6 +135,7 @@ A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). ```bash docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest ``` +[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389) ## Low-level API diff --git a/docker/cuda_simple/Dockerfile b/docker/cuda_simple/Dockerfile index 24906d53a..e4a2f07e2 100644 --- a/docker/cuda_simple/Dockerfile +++ b/docker/cuda_simple/Dockerfile @@ -8,7 +8,7 @@ COPY . . # Install the package RUN apt update && apt install -y python3 python3-pip -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings RUN LLAMA_CUBLAS=1 pip install llama-cpp-python diff --git a/docker/open_llama/Dockerfile b/docker/open_llama/Dockerfile index f0ef5f721..7788f33de 100644 --- a/docker/open_llama/Dockerfile +++ b/docker/open_llama/Dockerfile @@ -14,7 +14,7 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-reco ninja-build \ build-essential -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings # Perform the conditional installations based on the image RUN echo "Image: ${IMAGE}" && \ diff --git a/docker/openblas_simple/Dockerfile b/docker/openblas_simple/Dockerfile index 1a95caeda..8231bdb96 100644 --- a/docker/openblas_simple/Dockerfile +++ b/docker/openblas_simple/Dockerfile @@ -7,7 +7,7 @@ COPY . . 
# Install the package RUN apt update && apt install -y libopenblas-dev ninja-build build-essential -RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile new file mode 100644 index 000000000..507b2ba46 --- /dev/null +++ b/docker/simple/Dockerfile @@ -0,0 +1,34 @@ +# Define the image argument and provide a default value +ARG IMAGE=python:3-slim-bullseye + +# Use the image as specified +FROM ${IMAGE} + +# Re-declare the ARG after FROM +ARG IMAGE + +# Update and upgrade the existing packages +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + ninja-build \ + libopenblas-dev \ + build-essential + +RUN mkdir /app +WORKDIR /app +COPY . /app + +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings + +RUN make build && make clean + +# Set environment variable for the host +ENV HOST=0.0.0.0 +ENV PORT=8000 + +# Expose a port for the server +EXPOSE 8000 + +# Run the server start script +CMD ["/bin/sh", "/app/docker/simple/run.sh"] diff --git a/docker/simple/run.sh b/docker/simple/run.sh new file mode 100644 index 000000000..c85e73d2b --- /dev/null +++ b/docker/simple/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +make build +uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT diff --git a/docs/install/macos.md b/docs/install/macos.md index 600469615..3330396e3 100644 --- a/docs/install/macos.md +++ b/docs/install/macos.md @@ -26,19 +26,19 @@ conda create -n llama python=3.9.16 conda activate llama ``` -**(4) Install the LATEST llama-cpp-python.. which, as of just today, happily supports MacOS Metal GPU** +**(4) Install the LATEST llama-cpp-python...which happily supports MacOS Metal GPU as of version 0.1.62** *(you needed xcode installed in order pip to build/compile the C++ code)* ``` pip uninstall llama-cpp-python -y CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir pip install 'llama-cpp-python[server]' -# you should now have llama-cpp-python v0.1.62 installed -llama-cpp-python         0.1.62      +# you should now have llama-cpp-python v0.1.62 or higher installed +llama-cpp-python         0.1.68 ``` -**(4) Download a v3 ggml model** +**(5) Download a v3 ggml model** - **ggmlv3** - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 688b2a74f..ed27476e7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -19,6 +19,7 @@ from collections import deque, OrderedDict import diskcache +import ctypes from . import llama_cpp from .llama_types import * @@ -26,7 +27,6 @@ import numpy as np import numpy.typing as npt - class BaseLlamaCache(ABC): """Base cache class for a llama.cpp model.""" @@ -220,6 +220,9 @@ def __init__( lora_base: Optional[str] = None, lora_path: Optional[str] = None, low_vram: bool = False, + tensor_split: Optional[List[float]] = None, + rope_freq_base: float = 80000.0, + rope_freq_scale: float = 0.5, verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -240,6 +243,9 @@ def __init__( last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. 
lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. + tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split. + rope_freq_base: Base frequency for rope sampling. + rope_freq_scale: Scale factor for rope sampling. verbose: Print verbose output to stderr. Raises: @@ -248,6 +254,7 @@ def __init__( Returns: A Llama instance. """ + self.verbose = verbose self.model_path = model_path @@ -263,6 +270,18 @@ def __init__( self.params.embedding = embedding self.params.low_vram = low_vram + self.tensor_split = tensor_split + self._c_tensor_split = None + + if self.tensor_split is not None: + #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES + FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value + self._c_tensor_split = FloatArray(*tensor_split) # keep a reference to the array so it is not gc'd + self.params.tensor_split = self._c_tensor_split + + self.params.rope_freq_base = rope_freq_base + self.params.rope_freq_scale = rope_freq_scale + self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) @@ -324,6 +343,8 @@ def __init__( self._candidates = candidates self._token_nl = Llama.token_nl() self._token_eos = Llama.token_eos() + self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore + self._candidates_data_p = np.zeros(self._n_vocab, dtype=np.single) self.n_tokens = 0 self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) @@ -443,8 +464,12 @@ def eval(self, tokens: Sequence[int]): # Save logits rows = n_tokens if self.params.logits_all else 1 cols = self._n_vocab - offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False - self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = llama_cpp.llama_get_logits(self.ctx)[:rows * cols] + offset = ( + 0 if self.params.logits_all else n_tokens - 1 + ) # NOTE: Only save the last token logits if logits_all is False + self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape( + -1 + )[:] = llama_cpp.llama_get_logits(self.ctx)[: rows * cols] # Update n_tokens self.n_tokens += n_tokens @@ -487,9 +512,9 @@ def _sample( nl_logit = logits[self._token_nl] candidates = self._candidates candidates_data = self._candidates_data - candidates_data["id"] = np.arange(n_vocab, dtype=np.intc) # type: ignore - candidates_data["logit"] = logits - candidates_data["p"] = np.zeros(n_vocab, dtype=np.single) + candidates_data["id"][:] = self._candidates_data_id # type: ignore + candidates_data["logit"][:] = logits + candidates_data["p"][:] = self._candidates_data_p # type: ignore candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) candidates.sorted = llama_cpp.c_bool(False) candidates.size = llama_cpp.c_size_t(n_vocab) @@ -535,7 +560,7 @@ def _sample( mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) llama_cpp.llama_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) return llama_cpp.llama_sample_token_mirostat_v2( @@ -818,11 +843,15 @@ def _create_completion( if self.verbose: llama_cpp.llama_reset_timings(self.ctx) - if len(prompt_tokens) > self._n_ctx: + if len(prompt_tokens) >= llama_cpp.llama_n_ctx(self.ctx): raise ValueError( - f"Requested tokens ({len(prompt_tokens)}) 
exceed context window of {self._n_ctx}" + f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" ) + if max_tokens <= 0: + # Unlimited, depending on n_ctx. + max_tokens = llama_cpp.llama_n_ctx(self.ctx) - len(prompt_tokens) + # Truncate max_tokens if requested tokens would exceed the context window max_tokens = ( max_tokens @@ -958,7 +987,7 @@ def _create_completion( ) ], "text_offset": [text_offset], - "token_logprobs": [sorted_logprobs[int(token)][0]], + "token_logprobs": [current_logprobs[int(token)]], "top_logprobs": [top_logprob], } returned_tokens += 1 @@ -1033,7 +1062,7 @@ def _create_completion( self.detokenize([token]).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], - "token_logprobs": [sorted_logprobs[int(token)][0]], + "token_logprobs": [current_logprobs[int(token)]], "top_logprobs": [top_logprob], } @@ -1054,6 +1083,20 @@ def _create_completion( ].decode("utf-8", errors="ignore"), "index": 0, "logprobs": logprobs_or_none, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": "", + "index": 0, + "logprobs": None, "finish_reason": finish_reason, } ], @@ -1072,9 +1115,21 @@ def _create_completion( ), "index": 0, "logprobs": logprobs_or_none, - "finish_reason": finish_reason - if returned_tokens == len(completion_tokens) - else None, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": "", + "index": 0, + "logprobs": None, + "finish_reason": finish_reason, } ], } @@ -1131,7 +1186,7 @@ def _create_completion( zip(logprobs_token, range(len(logprobs_token))), reverse=True ) ) - token_logprobs.append(sorted_logprobs[int(token)][0]) + token_logprobs.append(logprobs_token[int(token)]) top_logprob: Optional[Dict[str, float]] = { self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] @@ -1199,7 +1254,7 @@ def create_completion( Args: prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. - max_tokens: The maximum number of tokens to generate. + max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx. temperature: The temperature to use for sampling. top_p: The top-p value to use for sampling. logprobs: The number of logprobs to return. If None, no logprobs are returned. @@ -1272,7 +1327,7 @@ def __call__( Args: prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. - max_tokens: The maximum number of tokens to generate. + max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx. temperature: The temperature to use for sampling. top_p: The top-p value to use for sampling. logprobs: The number of logprobs to return. If None, no logprobs are returned. @@ -1364,7 +1419,9 @@ def _convert_text_completion_chunks_to_chat( "index": 0, "delta": { "content": chunk["choices"][0]["text"], - }, + } + if chunk["choices"][0]["finish_reason"] is None + else {}, "finish_reason": chunk["choices"][0]["finish_reason"], } ], @@ -1398,7 +1455,7 @@ def create_chat_completion( top_k: The top-k value to use for sampling. stream: Whether to stream the results. 
stop: A list of strings to stop generation when encountered. - max_tokens: The maximum number of tokens to generate. + max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx. repeat_penalty: The penalty to apply to repeated tokens. Returns: @@ -1465,6 +1522,7 @@ def __getstate__(self): n_threads=self.n_threads, lora_base=self.lora_base, lora_path=self.lora_path, + tensor_split=self.tensor_split, ### DEPRECATED ### n_parts=self.n_parts, ### DEPRECATED ### @@ -1489,6 +1547,7 @@ def __setstate__(self, state): last_n_tokens_size=state["last_n_tokens_size"], lora_base=state["lora_base"], lora_path=state["lora_path"], + tensor_split=state["tensor_split"], verbose=state["verbose"], ) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index c68fb18d1..aef4f65c7 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -2,6 +2,7 @@ import os import ctypes from ctypes import ( + c_double, c_int, c_float, c_char_p, @@ -164,6 +165,11 @@ class llama_token_data_array(Structure): # int32_t n_gpu_layers; // number of layers to store in VRAM # int32_t main_gpu; // the GPU that is used for scratch and small tensors # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs + +# // ref: https://github.com/ggerganov/llama.cpp/pull/2054 +# float rope_freq_base; // RoPE base frequency +# float rope_freq_scale; // RoPE frequency scaling factor + # // called with a progress value between 0 and 1, pass NULL to disable # llama_progress_callback progress_callback; # // context pointer passed to the progress callback @@ -186,6 +192,8 @@ class llama_context_params(Structure): ("n_gpu_layers", c_int32), ("main_gpu", c_int32), ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), + ("rope_freq_base", c_float), + ("rope_freq_scale", c_float), ("progress_callback", llama_progress_callback), ("progress_callback_user_data", c_void_p), ("low_vram", c_bool), @@ -256,6 +264,34 @@ class llama_model_quantize_params(Structure): ] +# // performance timing information +# struct llama_timings { +# double t_start_ms; +# double t_end_ms; +# double t_load_ms; +# double t_sample_ms; +# double t_p_eval_ms; +# double t_eval_ms; + + +# int32_t n_sample; +# int32_t n_p_eval; +# int32_t n_eval; +# }; +class llama_timings(Structure): + _fields_ = [ + ("t_start_ms", c_double), + ("t_end_ms", c_double), + ("t_load_ms", c_double), + ("t_sample_ms", c_double), + ("t_p_eval_ms", c_double), + ("t_eval_ms", c_double), + ("n_sample", c_int32), + ("n_p_eval", c_int32), + ("n_eval", c_int32), + ] + + # LLAMA_API struct llama_context_params llama_context_default_params(); def llama_context_default_params() -> llama_context_params: return _lib.llama_context_default_params() @@ -296,13 +332,23 @@ def llama_mlock_supported() -> bool: # // Initialize the llama + ggml backend # // If numa is true, use NUMA optimizations # // Call once at the start of the program -# LLAMA_API void llama_init_backend(bool numa); -def llama_init_backend(numa: c_bool): - return _lib.llama_init_backend(numa) +# LLAMA_API void llama_backend_init(bool numa); +def llama_backend_init(numa: c_bool): + return _lib.llama_backend_init(numa) -_lib.llama_init_backend.argtypes = [c_bool] -_lib.llama_init_backend.restype = None +_lib.llama_backend_init.argtypes = [c_bool] +_lib.llama_backend_init.restype = None + + +# // Call once at the end of the program - currently only used for MPI +# LLAMA_API void llama_backend_free(); +def llama_backend_free(): + 
return _lib.llama_backend_free() + + +_lib.llama_backend_free.argtypes = [] +_lib.llama_backend_free.restype = None # LLAMA_API struct llama_model * llama_load_model_from_file( @@ -616,6 +662,22 @@ def llama_tokenize( _lib.llama_tokenize.restype = c_int +# LLAMA_API int llama_tokenize_with_model( +# const struct llama_model * model, +# const char * text, +# llama_token * tokens, +# int n_max_tokens, +# bool add_bos); +def llama_tokenize_with_model( + model: llama_model_p, + text: bytes, + tokens, # type: Array[llama_token] + n_max_tokens: c_int, + add_bos: c_bool, +) -> int: + return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos) + + # LLAMA_API int llama_n_vocab(const struct llama_context * ctx); def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -643,6 +705,33 @@ def llama_n_embd(ctx: llama_context_p) -> int: _lib.llama_n_embd.restype = c_int +# LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); +def llama_n_vocab_from_model(model: llama_model_p) -> int: + return _lib.llama_n_vocab_from_model(model) + + +_lib.llama_n_vocab_from_model.argtypes = [llama_model_p] +_lib.llama_n_vocab_from_model.restype = c_int + + +# LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); +def llama_n_ctx_from_model(model: llama_model_p) -> int: + return _lib.llama_n_ctx_from_model(model) + + +_lib.llama_n_ctx_from_model.argtypes = [llama_model_p] +_lib.llama_n_ctx_from_model.restype = c_int + + +# LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); +def llama_n_embd_from_model(model: llama_model_p) -> int: + return _lib.llama_n_embd_from_model(model) + + +_lib.llama_n_embd_from_model.argtypes = [llama_model_p] +_lib.llama_n_embd_from_model.restype = c_int + + # // Get the vocabulary as output parameters. # // Returns number of results. # LLAMA_API int llama_get_vocab( @@ -663,6 +752,20 @@ def llama_get_vocab( _lib.llama_get_vocab.restype = c_int +# LLAMA_API int llama_get_vocab_from_model( +# const struct llama_model * model, +# const char * * strings, +# float * scores, +# int capacity); +def llama_get_vocab_from_model( + model: llama_model_p, + strings, # type: Array[c_char_p] # type: ignore + scores, # type: Array[c_float] # type: ignore + capacity: c_int, +) -> int: + return _lib.llama_get_vocab_from_model(model, strings, scores, capacity) + + # Token logits obtained from the last call to llama_eval() # The logits for the last token are stored in the last row # Can be mutated in order to change the probabilities of the next token @@ -692,8 +795,10 @@ def llama_get_embeddings( _lib.llama_get_embeddings.restype = c_float_p -# Token Id -> String. Uses the vocabulary in the provided context -# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); +# // Token Id -> String. 
Uses the vocabulary in the provided context +# LLAMA_API const char * llama_token_to_str( +# const struct llama_context * ctx, +# llama_token token); def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: return _lib.llama_token_to_str(ctx, token) @@ -701,6 +806,17 @@ def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: _lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] _lib.llama_token_to_str.restype = c_char_p + +# LLAMA_API const char * llama_token_to_str_with_model( +# const struct llama_model * model, +# llama_token token); +def llama_token_to_str_with_model(model: llama_model_p, token: llama_token) -> bytes: + return _lib.llama_token_to_str_with_model(model, token) + + +_lib.llama_token_to_str_with_model.argtypes = [llama_model_p, llama_token] +_lib.llama_token_to_str_with_model.restype = c_char_p + # Special tokens @@ -789,6 +905,39 @@ def llama_sample_frequency_and_presence_penalties( _lib.llama_sample_frequency_and_presence_penalties.restype = None +# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 +# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. +# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. +# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. +# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits. +# LLAMA_API void llama_sample_classifier_free_guidance( +# struct llama_context * ctx, +# llama_token_data_array * candidates, +# struct llama_context * guidance_ctx, +# float scale, +# float smooth_factor); +def llama_sample_classifier_free_guidance( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + guidance_ctx: llama_context_p, + scale: c_float, + smooth_factor: c_float, +): + return _lib.llama_sample_classifier_free_guidance( + ctx, candidates, guidance_ctx, scale, smooth_factor + ) + + +_lib.llama_sample_classifier_free_guidance.argtypes = [ + llama_context_p, + llama_token_data_array_p, + llama_context_p, + c_float, + c_float, +] +_lib.llama_sample_classifier_free_guidance.restype = None + + # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
# LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_softmax( @@ -991,6 +1140,15 @@ def llama_sample_token( # Performance information +# LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); +def llama_get_timings(ctx: llama_context_p) -> llama_timings: + return _lib.llama_get_timings(ctx) + + +_lib.llama_get_timings.argtypes = [llama_context_p] +_lib.llama_get_timings.restype = llama_timings + + # LLAMA_API void llama_print_timings(struct llama_context * ctx); def llama_print_timings(ctx: llama_context_p): _lib.llama_print_timings(ctx) @@ -1024,5 +1182,5 @@ def llama_print_system_info() -> bytes: _llama_initialized = False if not _llama_initialized: - llama_init_backend(c_bool(False)) + llama_backend_init(c_bool(False)) _llama_initialized = True diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 7729ced5a..6ba8023bd 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Dict +from typing import Any, List, Optional, Dict, Union from typing_extensions import TypedDict, NotRequired, Literal @@ -77,6 +77,8 @@ class ChatCompletion(TypedDict): choices: List[ChatCompletionChoice] usage: CompletionUsage +class ChatCompletionChunkDeltaEmpty(TypedDict): + pass class ChatCompletionChunkDelta(TypedDict): role: NotRequired[Literal["assistant"]] @@ -85,7 +87,7 @@ class ChatCompletionChunkDelta(TypedDict): class ChatCompletionChunkChoice(TypedDict): index: int - delta: ChatCompletionChunkDelta + delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty] finish_reason: Optional[str] diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 748a2af33..995dd4449 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -3,7 +3,7 @@ To run this example: ```bash -pip install fastapi uvicorn sse-starlette +pip install fastapi uvicorn sse-starlette pydantic-settings export MODEL=../models/7B/... ``` @@ -30,14 +30,14 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - for name, field in Settings.__fields__.items(): - description = field.field_info.description + for name, field in Settings.model_fields.items(): + description = field.description if field.default is not None and description is not None: description += f" (default: {field.default})" parser.add_argument( f"--{name}", dest=name, - type=field.type_, + type=field.annotation if field.annotation is not None else str, help=description, ) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ef319c7e0..eaa6f44a9 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -12,7 +12,8 @@ from starlette.concurrency import run_in_threadpool, iterate_in_threadpool from fastapi import Depends, FastAPI, APIRouter, Request from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict +from pydantic import BaseModel, Field +from pydantic_settings import BaseSettings from sse_starlette.sse import EventSourceResponse @@ -30,6 +31,10 @@ class Settings(BaseSettings): ge=0, description="The number of layers to put on the GPU. The rest will be on the CPU.", ) + tensor_split: Optional[List[float]] = Field( + default=None, + description="Split layers across multiple GPUs in proportion.", + ) seed: int = Field( default=1337, description="Random seed. -1 for random." 
) @@ -79,11 +84,11 @@ class Settings(BaseSettings): verbose: bool = Field( default=True, description="Whether to print debug information." ) - host: str = Field( - default="localhost", description="Listen address" - ) - port: int = Field( - default=8000, description="Listen port" + host: str = Field(default="localhost", description="Listen address") + port: int = Field(default=8000, description="Listen port") + interrupt_requests: bool = Field( + default=True, + description="Whether to interrupt requests when a new request is received.", ) @@ -112,6 +117,7 @@ def create_app(settings: Optional[Settings] = None): llama = llama_cpp.Llama( model_path=settings.model, n_gpu_layers=settings.n_gpu_layers, + tensor_split=settings.tensor_split, seed=settings.seed, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, @@ -146,19 +152,34 @@ def set_settings(_settings: Settings): return app -llama_lock = Lock() +llama_outer_lock = Lock() +llama_inner_lock = Lock() def get_llama(): - with llama_lock: - yield llama + # NOTE: This double lock allows the currently streaming llama model to + # check if any other requests are pending in the same thread and cancel + # the stream if so. + llama_outer_lock.acquire() + release_outer_lock = True + try: + llama_inner_lock.acquire() + try: + llama_outer_lock.release() + release_outer_lock = False + yield llama + finally: + llama_inner_lock.release() + finally: + if release_outer_lock: + llama_outer_lock.release() def get_settings(): yield settings -model_field = Field(description="The model to use for generating completions.") +model_field = Field(description="The model to use for generating completions.", default=None) max_tokens_field = Field( default=16, ge=1, le=2048, description="The maximum number of tokens to generate." @@ -222,21 +243,18 @@ def get_settings(): default=0, ge=0, le=2, - description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)" + description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)", ) mirostat_tau_field = Field( default=5.0, ge=0.0, le=10.0, - description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text" + description="Mirostat target entropy, i.e. 
the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text", ) mirostat_eta_field = Field( - default=0.1, - ge=0.001, - le=1.0, - description="Mirostat learning rate" + default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate" ) @@ -274,23 +292,23 @@ class CreateCompletionRequest(BaseModel): model: Optional[str] = model_field n: Optional[int] = 1 best_of: Optional[int] = 1 - user: Optional[str] = Field(None) + user: Optional[str] = Field(default=None) # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - class Config: - schema_extra = { - "example": { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } + model_config = { + "json_schema_extra": { + "examples": [ + { + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": ["\n", "###"], + } + ] } - - -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) + } def make_logit_bias_processor( @@ -309,7 +327,7 @@ def make_logit_bias_processor( elif logit_bias_type == "tokens": for token, score in logit_bias.items(): - token = token.encode('utf-8') + token = token.encode("utf-8") for input_id in llama.tokenize(token, add_bos=False): to_bias[input_id] = score @@ -328,13 +346,12 @@ def logit_bias_processor( @router.post( "/v1/completions", - response_model=CreateCompletionResponse, ) async def create_completion( request: Request, body: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), -): +) -> llama_cpp.Completion: if isinstance(body.prompt, list): assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" @@ -346,7 +363,7 @@ async def create_completion( "logit_bias_type", "user", } - kwargs = body.dict(exclude=exclude) + kwargs = body.model_dump(exclude=exclude) if body.logit_bias is not None: kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ @@ -364,6 +381,9 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() + if settings.interrupt_requests and llama_outer_lock.locked(): + await inner_send_chan.send(dict(data="[DONE]")) + raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) except anyio.get_cancelled_exc_class() as e: print("disconnected") @@ -371,12 +391,11 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): print( f"Disconnected from client (via refresh/close) {request.client}" ) - await inner_send_chan.send(dict(closing=True)) raise e return EventSourceResponse( recv_chan, data_sender_callable=partial(event_publisher, send_chan) - ) + ) # type: ignore else: completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs) # type: ignore return completion @@ -385,28 +404,27 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): class CreateEmbeddingRequest(BaseModel): model: Optional[str] = model_field input: Union[str, List[str]] = Field(description="The input to embed.") - user: Optional[str] - - class Config: - schema_extra = { - "example": { - "input": "The food was delicious and the waiter...", - } + user: Optional[str] = Field(default=None) + + model_config = { + "json_schema_extra": { + "examples": [ + { + "input": "The 
food was delicious and the waiter...", + } + ] } - - -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) + } @router.post( "/v1/embeddings", - response_model=CreateEmbeddingResponse, ) async def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): return await run_in_threadpool( - llama.create_embedding, **request.dict(exclude={"user"}) + llama.create_embedding, **request.model_dump(exclude={"user"}) ) @@ -443,40 +461,40 @@ class CreateChatCompletionRequest(BaseModel): repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - class Config: - schema_extra = { - "example": { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" - ), - ] - } + model_config = { + "json_schema_extra": { + "examples": [ + { + "messages": [ + ChatCompletionRequestMessage( + role="system", content="You are a helpful assistant." + ).model_dump(), + ChatCompletionRequestMessage( + role="user", content="What is the capital of France?" + ).model_dump(), + ] + } + ] } - - -CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) + } @router.post( "/v1/chat/completions", - response_model=CreateChatCompletionResponse, ) async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), -) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: + settings: Settings = Depends(get_settings), +) -> llama_cpp.ChatCompletion: exclude = { "n", "logit_bias", "logit_bias_type", "user", } - kwargs = body.dict(exclude=exclude) + kwargs = body.model_dump(exclude=exclude) if body.logit_bias is not None: kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ @@ -494,6 +512,9 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() + if settings.interrupt_requests and llama_outer_lock.locked(): + await inner_send_chan.send(dict(data="[DONE]")) + raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) except anyio.get_cancelled_exc_class() as e: print("disconnected") @@ -501,13 +522,12 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): print( f"Disconnected from client (via refresh/close) {request.client}" ) - await inner_send_chan.send(dict(closing=True)) raise e return EventSourceResponse( recv_chan, data_sender_callable=partial(event_publisher, send_chan), - ) + ) # type: ignore else: completion: llama_cpp.ChatCompletion = await run_in_threadpool( llama.create_chat_completion, **kwargs # type: ignore @@ -527,14 +547,11 @@ class ModelList(TypedDict): data: List[ModelData] -GetModelResponse = create_model_from_typeddict(ModelList) - - -@router.get("/v1/models", response_model=GetModelResponse) +@router.get("/v1/models") async def get_models( settings: Settings = Depends(get_settings), - llama: llama_cpp.Llama = Depends(get_llama), ) -> ModelList: + assert llama is not None return { "object": "list", "data": [ diff --git a/pyproject.toml b/pyproject.toml index b3ad3b411..7839a869a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.68" +version = "0.1.72" description = "Python bindings for the llama.cpp 
library" authors = ["Andrei Betlen "] license = "MIT" @@ -32,7 +32,7 @@ httpx = "^0.24.1" scikit-build = "0.17.6" [tool.poetry.extras] -server = ["uvicorn", "fastapi", "sse-starlette"] +server = ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"] [build-system] requires = [ diff --git a/setup.py b/setup.py index 32101eb07..9b4de9785 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.68", + version="0.1.72", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", @@ -18,7 +18,7 @@ packages=["llama_cpp", "llama_cpp.server"], install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ - "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], + "server": ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"], }, python_requires=">=3.7", classifiers=[ diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7f0e9a775..6e7cca404 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7f0e9a775ecc4c6ade271c217f63d6dc93e79eaa +Subproject commit 6e7cca404748dd4b1a3affd0d1296e37f4ac0a6f