From 90e102115419453d75ba59c1d35ef23b8bbcea6e Mon Sep 17 00:00:00 2001 From: jm12138 <2286040843@qq.com> Date: Mon, 10 Apr 2023 15:56:05 +0000 Subject: [PATCH 01/48] Add unlimited max_tokens --- llama_cpp/llama.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2d76ec402..880e42d66 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -317,7 +317,15 @@ def _create_completion( if self.verbose: llama_cpp.llama_reset_timings(self.ctx) - if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)): + if max_tokens <= 0: + # Unlimited, depending on n_ctx. + if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)): + raise ValueError( + f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" + ) + else: + max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens) + elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)): raise ValueError( f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" ) @@ -455,7 +463,7 @@ def create_completion( Args: prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. - max_tokens: The maximum number of tokens to generate. + max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx. temperature: The temperature to use for sampling. top_p: The top-p value to use for sampling. logprobs: The number of logprobs to return. If None, no logprobs are returned. @@ -510,7 +518,7 @@ def __call__( Args: prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. - max_tokens: The maximum number of tokens to generate. + max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx. temperature: The temperature to use for sampling. top_p: The top-p value to use for sampling. logprobs: The number of logprobs to return. If None, no logprobs are returned. @@ -619,7 +627,7 @@ def create_chat_completion( top_k: The top-k value to use for sampling. stream: Whether to stream the results. stop: A list of strings to stop generation when encountered. - max_tokens: The maximum number of tokens to generate. + max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx. repeat_penalty: The penalty to apply to repeated tokens. Returns: From 036548365f16d05a638fe7c75a0d60fe452e954f Mon Sep 17 00:00:00 2001 From: SubhranshuSharma Date: Sat, 17 Jun 2023 14:50:07 +0530 Subject: [PATCH 02/48] added termux with root instructions --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0e62f3dfa..80c63f567 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,7 @@ A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). 
```bash docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest ``` +[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389) ## Low-level API From 98ae4e58a3adce4b3cf775121ee1f1ac2ce5ddb6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Jul 2023 17:57:56 -0400 Subject: [PATCH 03/48] Update llama.cpp --- Makefile | 3 +++ llama_cpp/llama_cpp.py | 39 +++++++++++++++++++++++++++++++++++++++ vendor/llama.cpp | 2 +- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 66d93f3a2..1be35cf8f 100644 --- a/Makefile +++ b/Makefile @@ -33,6 +33,9 @@ deploy.gh-docs: mkdocs build mkdocs gh-deploy +test: + python3 -m pytest + clean: - cd vendor/llama.cpp && make clean - cd vendor/llama.cpp && rm libllama.so diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index c68fb18d1..17c631961 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -2,6 +2,7 @@ import os import ctypes from ctypes import ( + c_double, c_int, c_float, c_char_p, @@ -169,6 +170,7 @@ class llama_token_data_array(Structure): # // context pointer passed to the progress callback # void * progress_callback_user_data; + # // Keep the booleans together to avoid misalignment during copy-by-value. # bool low_vram; // if true, reduce VRAM usage at the cost of performance # bool f16_kv; // use fp16 for KV cache @@ -256,6 +258,34 @@ class llama_model_quantize_params(Structure): ] +# // performance timing information +# struct llama_timings { +# double t_start_ms; +# double t_end_ms; +# double t_load_ms; +# double t_sample_ms; +# double t_p_eval_ms; +# double t_eval_ms; + + +# int32_t n_sample; +# int32_t n_p_eval; +# int32_t n_eval; +# }; +class llama_timings(Structure): + _fields_ = [ + ("t_start_ms", c_double), + ("t_end_ms", c_double), + ("t_load_ms", c_double), + ("t_sample_ms", c_double), + ("t_p_eval_ms", c_double), + ("t_eval_ms", c_double), + ("n_sample", c_int32), + ("n_p_eval", c_int32), + ("n_eval", c_int32), + ] + + # LLAMA_API struct llama_context_params llama_context_default_params(); def llama_context_default_params() -> llama_context_params: return _lib.llama_context_default_params() @@ -991,6 +1021,15 @@ def llama_sample_token( # Performance information +# LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); +def llama_get_timings(ctx: llama_context_p) -> llama_timings: + return _lib.llama_get_timings(ctx) + + +_lib.llama_get_timings.argtypes = [llama_context_p] +_lib.llama_get_timings.restype = llama_timings + + # LLAMA_API void llama_print_timings(struct llama_context * ctx); def llama_print_timings(ctx: llama_context_p): _lib.llama_print_timings(ctx) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7f0e9a775..dfd9fce6d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7f0e9a775ecc4c6ade271c217f63d6dc93e79eaa +Subproject commit dfd9fce6d65599bf33df43e616e85aa639bdae4c From 4c7cdcca00f63896a95e09a11f424237e224bc72 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 03:04:17 -0400 Subject: [PATCH 04/48] Add interruptible streaming requests for llama-cpp-python server. 
Closes #183 --- CHANGELOG.md | 4 ++++ llama_cpp/server/app.py | 31 +++++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c6cfaab28..11251c6df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [Added] + +- (server) Streaming requests can are now interrupted pre-maturely when a concurrent request is made. Can be controlled with the `interrupt_requests` setting. + ## [0.1.68] ## [Added] diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ef319c7e0..b9d57717e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -146,12 +146,27 @@ def set_settings(_settings: Settings): return app -llama_lock = Lock() +llama_outer_lock = Lock() +llama_inner_lock = Lock() def get_llama(): - with llama_lock: - yield llama + # NOTE: This double lock allows the currently streaming llama model to + # check if any other requests are pending in the same thread and cancel + # the stream if so. + llama_outer_lock.acquire() + release_outer_lock = True + try: + llama_inner_lock.acquire() + try: + llama_outer_lock.release() + release_outer_lock = False + yield llama + finally: + llama_inner_lock.release() + finally: + if release_outer_lock: + llama_outer_lock.release() def get_settings(): @@ -364,6 +379,9 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() + if llama_outer_lock.locked(): + await inner_send_chan.send(dict(data="[DONE]")) + raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) except anyio.get_cancelled_exc_class() as e: print("disconnected") @@ -371,7 +389,6 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): print( f"Disconnected from client (via refresh/close) {request.client}" ) - await inner_send_chan.send(dict(closing=True)) raise e return EventSourceResponse( @@ -494,6 +511,9 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() + if llama_outer_lock.locked(): + await inner_send_chan.send(dict(data="[DONE]")) + raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) except anyio.get_cancelled_exc_class() as e: print("disconnected") @@ -501,7 +521,6 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): print( f"Disconnected from client (via refresh/close) {request.client}" ) - await inner_send_chan.send(dict(closing=True)) raise e return EventSourceResponse( @@ -533,8 +552,8 @@ class ModelList(TypedDict): @router.get("/v1/models", response_model=GetModelResponse) async def get_models( settings: Settings = Depends(get_settings), - llama: llama_cpp.Llama = Depends(get_llama), ) -> ModelList: + assert llama is not None return { "object": "list", "data": [ From cc542b4452ec92919bb2964e40314c7077c264be Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 03:04:54 -0400 Subject: [PATCH 05/48] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index dfd9fce6d..481f793ac 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 
dfd9fce6d65599bf33df43e616e85aa639bdae4c +Subproject commit 481f793acc3882a09d45d8d2c3076ad3d1c60cfc From 57d8ec3899f2c48def77f8cf3d3feae45ca12aa3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 03:37:23 -0400 Subject: [PATCH 06/48] Add setting to control request interruption --- llama_cpp/server/app.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b9d57717e..5d47160e9 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -85,6 +85,10 @@ class Settings(BaseSettings): port: int = Field( default=8000, description="Listen port" ) + interrupt_requests: bool = Field( + default=True, + description="Whether to interrupt requests when a new request is received.", + ) router = APIRouter() @@ -379,7 +383,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() - if llama_outer_lock.locked(): + if settings.interrupt_requests and llama_outer_lock.locked(): await inner_send_chan.send(dict(data="[DONE]")) raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) @@ -486,6 +490,7 @@ async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), + settings: Settings = Depends(get_settings), ) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: exclude = { "n", @@ -511,7 +516,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() - if llama_outer_lock.locked(): + if settings.interrupt_requests and llama_outer_lock.locked(): await inner_send_chan.send(dict(data="[DONE]")) raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) From ca11673061ecd9198b4800f68073ae14d4440ecd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 03:38:51 -0400 Subject: [PATCH 07/48] Add universal docker image --- Makefile | 7 +++++++ docker/simple/Dockerfile | 33 +++++++++++++++++++++++++++++++++ docker/simple/run.sh | 4 ++++ 3 files changed, 44 insertions(+) create mode 100644 docker/simple/Dockerfile create mode 100644 docker/simple/run.sh diff --git a/Makefile b/Makefile index 1be35cf8f..c359260b6 100644 --- a/Makefile +++ b/Makefile @@ -36,6 +36,12 @@ deploy.gh-docs: test: python3 -m pytest +docker: + docker build -t llama-cpp-python:latest -f docker/simple/Dockerfile . + +run-server: + uvicorn --factory llama.server:app --host ${HOST} --port ${PORT} + clean: - cd vendor/llama.cpp && make clean - cd vendor/llama.cpp && rm libllama.so @@ -56,4 +62,5 @@ clean: build.sdist \ deploy.pypi \ deploy.gh-docs \ + docker \ clean \ No newline at end of file diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile new file mode 100644 index 000000000..ad36b989a --- /dev/null +++ b/docker/simple/Dockerfile @@ -0,0 +1,33 @@ +# Define the image argument and provide a default value +ARG IMAGE=python:3-slim-bullseye + +# Use the image as specified +FROM ${IMAGE} + +# Re-declare the ARG after FROM +ARG IMAGE + +# Update and upgrade the existing packages +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + ninja-build \ + build-essential + +RUN mkdir /app +WORKDIR /app +COPY . 
/app + +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette + +RUN make build && make clean + +# Set environment variable for the host +ENV HOST=0.0.0.0 +ENV PORT=8000 + +# Expose a port for the server +EXPOSE 8000 + +# Run the server start script +CMD ["/bin/sh", "/app/docker/simple/run.sh"] diff --git a/docker/simple/run.sh b/docker/simple/run.sh new file mode 100644 index 000000000..c85e73d2b --- /dev/null +++ b/docker/simple/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +make build +uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT From d270ec231ad620beeb20da93de3b05f7a2d55cb4 Mon Sep 17 00:00:00 2001 From: Audrey Roy Greenfeld Date: Fri, 7 Jul 2023 11:15:04 +0100 Subject: [PATCH 08/48] Update macOS Metal GPU step 4 * Update "today" to version 0.1.62 * Fix numbering (there were 2 step 4's) --- docs/install/macos.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/install/macos.md b/docs/install/macos.md index 600469615..3330396e3 100644 --- a/docs/install/macos.md +++ b/docs/install/macos.md @@ -26,19 +26,19 @@ conda create -n llama python=3.9.16 conda activate llama ``` -**(4) Install the LATEST llama-cpp-python.. which, as of just today, happily supports MacOS Metal GPU** +**(4) Install the LATEST llama-cpp-python...which happily supports MacOS Metal GPU as of version 0.1.62** *(you needed xcode installed in order pip to build/compile the C++ code)* ``` pip uninstall llama-cpp-python -y CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir pip install 'llama-cpp-python[server]' -# you should now have llama-cpp-python v0.1.62 installed -llama-cpp-python         0.1.62      +# you should now have llama-cpp-python v0.1.62 or higher installed +llama-cpp-python         0.1.68 ``` -**(4) Download a v3 ggml model** +**(5) Download a v3 ggml model** - **ggmlv3** - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 From 9e61661518d78973555cb0424d371e943674cd88 Mon Sep 17 00:00:00 2001 From: wu-qing-157 Date: Fri, 7 Jul 2023 10:18:49 +0000 Subject: [PATCH 09/48] fix indexing token_logprobs after sorting --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 688b2a74f..31d70b7ec 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -958,7 +958,7 @@ def _create_completion( ) ], "text_offset": [text_offset], - "token_logprobs": [sorted_logprobs[int(token)][0]], + "token_logprobs": [current_logprobs[int(token)]], "top_logprobs": [top_logprob], } returned_tokens += 1 @@ -1033,7 +1033,7 @@ def _create_completion( self.detokenize([token]).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], - "token_logprobs": [sorted_logprobs[int(token)][0]], + "token_logprobs": [current_logprobs[int(token)]], "top_logprobs": [top_logprob], } @@ -1131,7 +1131,7 @@ def _create_completion( zip(logprobs_token, range(len(logprobs_token))), reverse=True ) ) - token_logprobs.append(sorted_logprobs[int(token)][0]) + token_logprobs.append(logprobs_token[int(token)]) top_logprob: Optional[Dict[str, float]] = { self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] From a14d8a9b3fdc2f967c7c8905fe7911bddb0935a0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 18:58:43 -0400 Subject: [PATCH 10/48] perf: assign to candidates data structure instead --- llama_cpp/llama.py | 6 
+++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 688b2a74f..35823cfa3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -487,9 +487,9 @@ def _sample( nl_logit = logits[self._token_nl] candidates = self._candidates candidates_data = self._candidates_data - candidates_data["id"] = np.arange(n_vocab, dtype=np.intc) # type: ignore - candidates_data["logit"] = logits - candidates_data["p"] = np.zeros(n_vocab, dtype=np.single) + candidates_data["id"][:] = np.arange(n_vocab, dtype=np.intc) # type: ignore + candidates_data["logit"][:] = logits + candidates_data["p"][:] = np.zeros(n_vocab, dtype=np.single) candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) candidates.sorted = llama_cpp.c_bool(False) candidates.size = llama_cpp.c_size_t(n_vocab) From 7887376bffec533083f2d2170424db076089c39d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 19:06:54 -0400 Subject: [PATCH 11/48] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 481f793ac..061f5f8d2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 481f793acc3882a09d45d8d2c3076ad3d1c60cfc +Subproject commit 061f5f8d2109bb7adcbd40f1b456d887c5a1df25 From 11eae752110f3f69088c6a551c965f42f1507148 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 19:28:53 -0400 Subject: [PATCH 12/48] perf: avoid allocating new buffers during sampling --- llama_cpp/llama.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 35823cfa3..089518255 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -324,6 +324,8 @@ def __init__( self._candidates = candidates self._token_nl = Llama.token_nl() self._token_eos = Llama.token_eos() + self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore + self._candidates_data_p = np.zeros(self._n_vocab, dtype=np.single) self.n_tokens = 0 self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) @@ -487,9 +489,9 @@ def _sample( nl_logit = logits[self._token_nl] candidates = self._candidates candidates_data = self._candidates_data - candidates_data["id"][:] = np.arange(n_vocab, dtype=np.intc) # type: ignore + candidates_data["id"][:] = self._candidates_data_id # type: ignore candidates_data["logit"][:] = logits - candidates_data["p"][:] = np.zeros(n_vocab, dtype=np.single) + candidates_data["p"][:] = self._candidates_data_p # type: ignore candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) candidates.sorted = llama_cpp.c_bool(False) candidates.size = llama_cpp.c_size_t(n_vocab) From 52753b77f556c46057f5272b2ee547868cf53397 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 21:38:46 -0400 Subject: [PATCH 13/48] Upgrade fastapi to 0.100.0 and pydantic v2 --- .github/workflows/test.yaml | 6 +++--- docker/cuda_simple/Dockerfile | 2 +- docker/open_llama/Dockerfile | 2 +- docker/openblas_simple/Dockerfile | 2 +- docker/simple/Dockerfile | 2 +- llama_cpp/server/__main__.py | 4 ++-- llama_cpp/server/app.py | 14 ++++---------- pyproject.toml | 2 +- setup.py | 2 +- 9 files changed, 15 insertions(+), 21 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 56524e0db..a73e347b5 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -26,7 +26,7 @@ jobs: python-version: ${{ 
matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | @@ -49,7 +49,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | @@ -72,7 +72,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | diff --git a/docker/cuda_simple/Dockerfile b/docker/cuda_simple/Dockerfile index 24906d53a..e4a2f07e2 100644 --- a/docker/cuda_simple/Dockerfile +++ b/docker/cuda_simple/Dockerfile @@ -8,7 +8,7 @@ COPY . . # Install the package RUN apt update && apt install -y python3 python3-pip -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings RUN LLAMA_CUBLAS=1 pip install llama-cpp-python diff --git a/docker/open_llama/Dockerfile b/docker/open_llama/Dockerfile index f0ef5f721..7788f33de 100644 --- a/docker/open_llama/Dockerfile +++ b/docker/open_llama/Dockerfile @@ -14,7 +14,7 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-reco ninja-build \ build-essential -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings # Perform the conditional installations based on the image RUN echo "Image: ${IMAGE}" && \ diff --git a/docker/openblas_simple/Dockerfile b/docker/openblas_simple/Dockerfile index 1a95caeda..8231bdb96 100644 --- a/docker/openblas_simple/Dockerfile +++ b/docker/openblas_simple/Dockerfile @@ -7,7 +7,7 @@ COPY . . # Install the package RUN apt update && apt install -y libopenblas-dev ninja-build build-essential -RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile index ad36b989a..77680c811 100644 --- a/docker/simple/Dockerfile +++ b/docker/simple/Dockerfile @@ -18,7 +18,7 @@ RUN mkdir /app WORKDIR /app COPY . 
/app -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings RUN make build && make clean diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 748a2af33..2110db31f 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -3,7 +3,7 @@ To run this example: ```bash -pip install fastapi uvicorn sse-starlette +pip install fastapi uvicorn sse-starlette pydantic-settings export MODEL=../models/7B/... ``` @@ -30,7 +30,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - for name, field in Settings.__fields__.items(): + for name, field in Settings.__model_fields__.items(): description = field.field_info.description if field.default is not None and description is not None: description += f" (default: {field.default})" diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 5d47160e9..ffd07fa6b 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -12,7 +12,8 @@ from starlette.concurrency import run_in_threadpool, iterate_in_threadpool from fastapi import Depends, FastAPI, APIRouter, Request from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict +from pydantic import BaseModel, Field +from pydantic_settings import BaseSettings from sse_starlette.sse import EventSourceResponse @@ -309,7 +310,6 @@ class Config: } -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) def make_logit_bias_processor( @@ -347,7 +347,6 @@ def logit_bias_processor( @router.post( "/v1/completions", - response_model=CreateCompletionResponse, ) async def create_completion( request: Request, @@ -416,12 +415,10 @@ class Config: } -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) @router.post( "/v1/embeddings", - response_model=CreateEmbeddingResponse, ) async def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) @@ -479,19 +476,17 @@ class Config: } -CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) @router.post( "/v1/chat/completions", - response_model=CreateChatCompletionResponse, ) async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), settings: Settings = Depends(get_settings), -) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: +) -> Union[llama_cpp.ChatCompletion]: # type: ignore exclude = { "n", "logit_bias", @@ -551,10 +546,9 @@ class ModelList(TypedDict): data: List[ModelData] -GetModelResponse = create_model_from_typeddict(ModelList) -@router.get("/v1/models", response_model=GetModelResponse) +@router.get("/v1/models") async def get_models( settings: Settings = Depends(get_settings), ) -> ModelList: diff --git a/pyproject.toml b/pyproject.toml index b3ad3b411..841a86869 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ httpx = "^0.24.1" scikit-build = "0.17.6" [tool.poetry.extras] -server = ["uvicorn", "fastapi", "sse-starlette"] +server = ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"] [build-system] requires = [ diff --git a/setup.py b/setup.py index 32101eb07..1d7ecbce0 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ packages=["llama_cpp", "llama_cpp.server"], 
install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ - "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], + "server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"], }, python_requires=">=3.7", classifiers=[ From 34c505edf2609acef51b47533f10cd2b8dc2f715 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 22:54:07 -0400 Subject: [PATCH 14/48] perf: convert pointer to byref --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 089518255..130e01390 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -537,7 +537,7 @@ def _sample( mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) llama_cpp.llama_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) return llama_cpp.llama_sample_token_mirostat_v2( From ea4fbadab39548673e2a835223968b023006e539 Mon Sep 17 00:00:00 2001 From: AgentJ-WR <60302956+AgentJ-WR@users.noreply.github.com> Date: Fri, 7 Jul 2023 23:24:57 -0400 Subject: [PATCH 15/48] Show how to adjust context window in README.md --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index fb652a925..0322c73a3 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,15 @@ Below is a short example demonstrating how to use the high-level API to generate } ``` +### Adjusting the Context Window +The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements. + +For instance, if you want to work with larger contexts, you can expand the context window by setting the n_ctx parameter when initializing the Llama object: + +```python +llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048) +``` + ## Web Server `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. 
From 4f2b5d0b5321bedc879ee9b9a19ca15d18ddb995 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 00:05:10 -0400 Subject: [PATCH 16/48] Format --- llama_cpp/llama.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 130e01390..f8e05271c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -324,7 +324,7 @@ def __init__( self._candidates = candidates self._token_nl = Llama.token_nl() self._token_eos = Llama.token_eos() - self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore + self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore self._candidates_data_p = np.zeros(self._n_vocab, dtype=np.single) self.n_tokens = 0 @@ -445,8 +445,12 @@ def eval(self, tokens: Sequence[int]): # Save logits rows = n_tokens if self.params.logits_all else 1 cols = self._n_vocab - offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False - self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = llama_cpp.llama_get_logits(self.ctx)[:rows * cols] + offset = ( + 0 if self.params.logits_all else n_tokens - 1 + ) # NOTE: Only save the last token logits if logits_all is False + self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape( + -1 + )[:] = llama_cpp.llama_get_logits(self.ctx)[: rows * cols] # Update n_tokens self.n_tokens += n_tokens @@ -491,7 +495,7 @@ def _sample( candidates_data = self._candidates_data candidates_data["id"][:] = self._candidates_data_id # type: ignore candidates_data["logit"][:] = logits - candidates_data["p"][:] = self._candidates_data_p # type: ignore + candidates_data["p"][:] = self._candidates_data_p # type: ignore candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) candidates.sorted = llama_cpp.c_bool(False) candidates.size = llama_cpp.c_size_t(n_vocab) @@ -537,7 +541,7 @@ def _sample( mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) llama_cpp.llama_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) return llama_cpp.llama_sample_token_mirostat_v2( From d6e6aad927690d4bb3229be3f7980a64e46d4866 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 00:06:11 -0400 Subject: [PATCH 17/48] bugfix: fix compatibility bug with openai api on last token --- llama_cpp/llama.py | 36 ++++++++++++++++++++++++++++++++---- llama_cpp/llama_types.py | 6 ++++-- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f8e05271c..d7d3e85e7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1060,6 +1060,20 @@ def _create_completion( ].decode("utf-8", errors="ignore"), "index": 0, "logprobs": logprobs_or_none, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": "", + "index": 0, + "logprobs": None, "finish_reason": finish_reason, } ], @@ -1078,9 +1092,21 @@ def _create_completion( ), "index": 0, "logprobs": logprobs_or_none, - "finish_reason": finish_reason - if returned_tokens == len(completion_tokens) - else None, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": "", + "index": 0, + 
"logprobs": None, + "finish_reason": finish_reason, } ], } @@ -1370,7 +1396,9 @@ def _convert_text_completion_chunks_to_chat( "index": 0, "delta": { "content": chunk["choices"][0]["text"], - }, + } + if chunk["choices"][0]["finish_reason"] is None + else {}, "finish_reason": chunk["choices"][0]["finish_reason"], } ], diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 7729ced5a..6ba8023bd 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Dict +from typing import Any, List, Optional, Dict, Union from typing_extensions import TypedDict, NotRequired, Literal @@ -77,6 +77,8 @@ class ChatCompletion(TypedDict): choices: List[ChatCompletionChoice] usage: CompletionUsage +class ChatCompletionChunkDeltaEmpty(TypedDict): + pass class ChatCompletionChunkDelta(TypedDict): role: NotRequired[Literal["assistant"]] @@ -85,7 +87,7 @@ class ChatCompletionChunkDelta(TypedDict): class ChatCompletionChunkChoice(TypedDict): index: int - delta: ChatCompletionChunkDelta + delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty] finish_reason: Optional[str] From 670fe4b701b2c8a8a97bac5293bb65004507a1f8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 03:37:12 -0400 Subject: [PATCH 18/48] Update changelog --- CHANGELOG.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11251c6df..805d7be2b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,30 +7,36 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -## [Added] +### Added - (server) Streaming requests can are now interrupted pre-maturely when a concurrent request is made. Can be controlled with the `interrupt_requests` setting. +- (server) Moved to fastapi v0.100.0 and pydantic v2 +- (docker) Added a new "simple" image that builds llama.cpp from source when started. + +## Fixed + +- (server) performance improvements by avoiding unnecessary memory allocations during sampling ## [0.1.68] -## [Added] +### Added - (llama.cpp) Update llama.cpp ## [0.1.67] -## Fixed +### Fixed - Fix performance bug in Llama model by pre-allocating memory tokens and logits. - Fix bug in Llama model where the model was not free'd after use. ## [0.1.66] -## Added +### Added - (llama.cpp) New model API -## Fixed +### Fixed - Performance issue during eval caused by looped np.concatenate call - State pickling issue when saving cache to disk From 3a2635b9e1d591a4823f1d302e12cdc2a84a8b18 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 03:37:28 -0400 Subject: [PATCH 19/48] Update docker workflow for new simple image --- .github/workflows/build-docker.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 16b00a2f0..25669b77d 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -11,10 +11,6 @@ jobs: name: Build and push Docker image runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: "true" - name: Set up QEMU uses: docker/setup-qemu-action@v2 @@ -33,6 +29,7 @@ jobs: uses: docker/build-push-action@v4 with: context: . 
+ path: ./docker/simple/Dockerfile push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 From 5b7d76608d8169e6d7696c12f0366e6ccdd6cd0c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 03:43:17 -0400 Subject: [PATCH 20/48] docker: add checkout action to dockerfile --- .github/workflows/build-docker.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 25669b77d..e0bf79ce1 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -11,6 +11,10 @@ jobs: name: Build and push Docker image runs-on: ubuntu-latest steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: "true" - name: Set up QEMU uses: docker/setup-qemu-action@v2 From 9e153fd11d1de032181619206213504c3ade1068 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 03:44:51 -0400 Subject: [PATCH 21/48] docker: update context path --- .github/workflows/build-docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index e0bf79ce1..04761e53d 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -33,7 +33,7 @@ jobs: uses: docker/build-push-action@v4 with: context: . - path: ./docker/simple/Dockerfile + path: "{context}/docker/simple/Dockerfile" push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 From 1f5e748a7e284c6376d637dd11e54be611972483 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 04:00:43 -0400 Subject: [PATCH 22/48] docker: fix docker build action args --- .github/workflows/build-docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 04761e53d..0e1738fd4 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -33,7 +33,7 @@ jobs: uses: docker/build-push-action@v4 with: context: . - path: "{context}/docker/simple/Dockerfile" + file: "{context}/docker/simple/Dockerfile" push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 From 3c85c4157391364e2a7a3ff5818c5673d5f61696 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 04:04:11 -0400 Subject: [PATCH 23/48] docker: update path to dockerfile --- .github/workflows/build-docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 0e1738fd4..4b38dbacb 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -33,7 +33,7 @@ jobs: uses: docker/build-push-action@v4 with: context: . 
- file: "{context}/docker/simple/Dockerfile" + file: "docker/simple/Dockerfile" push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 From 00da643929d4ce81437f3b77cf29ec0a9af18b55 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 20:30:34 -0400 Subject: [PATCH 24/48] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 061f5f8d2..64639555f 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 061f5f8d2109bb7adcbd40f1b456d887c5a1df25 +Subproject commit 64639555ff93c8ead2b80becb49cc6b60aeac240 From 99f064e6812775fa74e0bcb90f069b9025940a3e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 01:36:39 -0400 Subject: [PATCH 25/48] docker: Add libopenblas to simple image --- docker/simple/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile index 77680c811..507b2ba46 100644 --- a/docker/simple/Dockerfile +++ b/docker/simple/Dockerfile @@ -12,6 +12,7 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-reco python3 \ python3-pip \ ninja-build \ + libopenblas-dev \ build-essential RUN mkdir /app From 9f21f548a5a61e96e637c6baea7596fd8cdc0b01 Mon Sep 17 00:00:00 2001 From: Shouyi Wang Date: Fri, 7 Jul 2023 19:22:10 +1000 Subject: [PATCH 26/48] Add tensor split --- llama_cpp/llama.py | 13 ++++++++++++- llama_cpp/server/app.py | 5 +++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 62e0daee3..aefb8a3af 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -19,6 +19,7 @@ from collections import deque, OrderedDict import diskcache +import ctypes from . import llama_cpp from .llama_types import * @@ -26,7 +27,6 @@ import numpy as np import numpy.typing as npt - class BaseLlamaCache(ABC): """Base cache class for a llama.cpp model.""" @@ -207,6 +207,7 @@ def __init__( n_ctx: int = 512, n_parts: int = -1, n_gpu_layers: int = 0, + tensor_split: list[float] = None, seed: int = 1337, f16_kv: bool = True, logits_all: bool = False, @@ -248,12 +249,20 @@ def __init__( Returns: A Llama instance. 
""" + if tensor_split is None: + tensor_split = [0.0] * llama_cpp.LLAMA_MAX_DEVICES.value + + #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES + FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value + c_tensor_split = FloatArray(*tensor_split) + self.verbose = verbose self.model_path = model_path self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx self.params.n_gpu_layers = n_gpu_layers + self.params.tensor_split = c_tensor_split self.params.seed = seed self.params.f16_kv = f16_kv self.params.logits_all = logits_all @@ -1494,6 +1503,7 @@ def __getstate__(self): model_path=self.model_path, n_ctx=self.params.n_ctx, n_gpu_layers=self.params.n_gpu_layers, + tensor_split=self.params.tensor_split, seed=self.params.seed, f16_kv=self.params.f16_kv, logits_all=self.params.logits_all, @@ -1518,6 +1528,7 @@ def __setstate__(self, state): n_ctx=state["n_ctx"], n_parts=state["n_parts"], n_gpu_layers=state["n_gpu_layers"], + tensor_split=state["tensor_split"], seed=state["seed"], f16_kv=state["f16_kv"], logits_all=state["logits_all"], diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ffd07fa6b..7b09d8443 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -31,6 +31,10 @@ class Settings(BaseSettings): ge=0, description="The number of layers to put on the GPU. The rest will be on the CPU.", ) + tensor_split: List[float] = Field( + default=None, + description="Split layers across multiple GPUs in proportion.", + ) seed: int = Field( default=1337, description="Random seed. -1 for random." ) @@ -117,6 +121,7 @@ def create_app(settings: Optional[Settings] = None): llama = llama_cpp.Llama( model_path=settings.model, n_gpu_layers=settings.n_gpu_layers, + tensor_split=settings.tensor_split, seed=settings.seed, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, From 9aa64163dbf57a6f36b80ba1b8399b050607b9c7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 11:40:59 -0400 Subject: [PATCH 27/48] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 64639555f..1d1630996 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 64639555ff93c8ead2b80becb49cc6b60aeac240 +Subproject commit 1d1630996920f889cdc08de26cebf2415958540e From 0f3c474a49af412117449b19a2844f84c23205ca Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 11:44:29 -0400 Subject: [PATCH 28/48] Bump version --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 805d7be2b..0e181d691 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.69] + ### Added - (server) Streaming requests can are now interrupted pre-maturely when a concurrent request is made. Can be controlled with the `interrupt_requests` setting. 
diff --git a/pyproject.toml b/pyproject.toml index 841a86869..fb1962936 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.68" +version = "0.1.69" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 1d7ecbce0..baaabcc47 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.68", + version="0.1.69", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 6f70cc4b7dd950a95708ed7e7da9ac550e87a76c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 18:03:31 -0400 Subject: [PATCH 29/48] bugfix: pydantic settings missing / changed fields --- llama_cpp/server/__main__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 2110db31f..995dd4449 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -30,14 +30,14 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - for name, field in Settings.__model_fields__.items(): - description = field.field_info.description + for name, field in Settings.model_fields.items(): + description = field.description if field.default is not None and description is not None: description += f" (default: {field.default})" parser.add_argument( f"--{name}", dest=name, - type=field.type_, + type=field.annotation if field.annotation is not None else str, help=description, ) From a86bfdf0a50f23a6aebb3f095ada0afcf8791d6e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 18:13:29 -0400 Subject: [PATCH 30/48] bugfix: truncate completion max_tokens to fit context length by default --- llama_cpp/llama.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 62e0daee3..edb68c9e5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -824,19 +824,15 @@ def _create_completion( if self.verbose: llama_cpp.llama_reset_timings(self.ctx) - if max_tokens <= 0: - # Unlimited, depending on n_ctx. - if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)): - raise ValueError( - f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" - ) - else: - max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens) - elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)): + if len(prompt_tokens) >= llama_cpp.llama_n_ctx(self.ctx): raise ValueError( - f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}" + f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" ) + if max_tokens <= 0: + # Unlimited, depending on n_ctx. 
+ max_tokens = llama_cpp.llama_n_ctx(self.ctx) - len(prompt_tokens) + # Truncate max_tokens if requested tokens would exceed the context window max_tokens = ( max_tokens From df3d54593868fbe5e8e488cd0c7a638971fbd3b8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 18:13:41 -0400 Subject: [PATCH 31/48] Update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e181d691..40974132b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- (Llama.create_completion) Revert change so that `max_tokens` is not truncated to `context_size` in `create_completion` +- (server) Fixed changed settings field names from pydantic v2 migration + ## [0.1.69] ### Added From c988c2ac0b7611e4fe8001a28002767f37e09675 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 18:19:37 -0400 Subject: [PATCH 32/48] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fb1962936..a9e012e6b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.69" +version = "0.1.70" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index baaabcc47..b8acedb5a 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.69", + version="0.1.70", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 8e0f6253db0e8aa30bcc90fc26d49d221d003070 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 18:20:04 -0400 Subject: [PATCH 33/48] Bump version --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40974132b..8b5db37b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.70] + ### Fixed - (Llama.create_completion) Revert change so that `max_tokens` is not truncated to `context_size` in `create_completion` diff --git a/pyproject.toml b/pyproject.toml index fb1962936..a9e012e6b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.69" +version = "0.1.70" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index baaabcc47..b8acedb5a 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.69", + version="0.1.70", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 3f8f276f9f79ec4394ba1b73f4d5f0afb11e2d96 Mon Sep 17 00:00:00 2001 From: randoentity <137087500+randoentity@users.noreply.github.com> Date: Sun, 9 Jul 2023 09:05:16 +0200 Subject: [PATCH 34/48] Add bindings for custom_rope --- llama_cpp/llama.py | 6 ++++++ llama_cpp/llama_cpp.py | 2 ++ vendor/llama.cpp | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index edb68c9e5..ada6d695b 
100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -205,6 +205,8 @@ def __init__( model_path: str, # NOTE: These parameters are likely to change in the future. n_ctx: int = 512, + rope_freq_base: float = 10000.0, + rope_freq_scale: float = 1.0, n_parts: int = -1, n_gpu_layers: int = 0, seed: int = 1337, @@ -227,6 +229,8 @@ def __init__( Args: model_path: Path to the model. n_ctx: Maximum context size. + rope_freq_base: RoPE base frequency. + rope_freq_scale: RoPE frequency scale. n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. seed: Random seed. -1 for random. f16_kv: Use half-precision for key/value cache. @@ -253,6 +257,8 @@ def __init__( self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx + self.params.rope_freq_base = rope_freq_base + self.params.rope_freq_scale = rope_freq_scale self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed self.params.f16_kv = f16_kv diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 17c631961..320c48b9c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -184,6 +184,8 @@ class llama_context_params(Structure): _fields_ = [ ("seed", c_uint32), ("n_ctx", c_int32), + ("rope_freq_base", c_float), + ("rope_freq_scale", c_float), ("n_batch", c_int32), ("n_gpu_layers", c_int32), ("main_gpu", c_int32), diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1d1630996..a3b4d9328 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1d1630996920f889cdc08de26cebf2415958540e +Subproject commit a3b4d932859f4e51ed716bfa1f07e2d2eede2c23 From 7bb0024cd0c12d0d36207172410f13e1d343eeac Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 12 Jul 2023 19:31:43 -0400 Subject: [PATCH 35/48] Fix uvicorn dependency --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b8acedb5a..ab5d825d4 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ packages=["llama_cpp", "llama_cpp.server"], install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ - "server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"], + "server": ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"], }, python_requires=">=3.7", classifiers=[ From 896ab7b88a45768dcb0e6038ed6ec8cbdd88a634 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 13 Jul 2023 23:24:55 -0400 Subject: [PATCH 36/48] Update llama.cpp --- llama_cpp/llama_cpp.py | 55 +++++++++++++++++++++++++++++++++++++----- vendor/llama.cpp | 2 +- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 17c631961..b5bab56a3 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -326,13 +326,23 @@ def llama_mlock_supported() -> bool: # // Initialize the llama + ggml backend # // If numa is true, use NUMA optimizations # // Call once at the start of the program -# LLAMA_API void llama_init_backend(bool numa); -def llama_init_backend(numa: c_bool): - return _lib.llama_init_backend(numa) +# LLAMA_API void llama_backend_init(bool numa); +def llama_backend_init(numa: c_bool): + return _lib.llama_backend_init(numa) -_lib.llama_init_backend.argtypes = [c_bool] -_lib.llama_init_backend.restype = None +_lib.llama_backend_init.argtypes = [c_bool] +_lib.llama_backend_init.restype = None + + +# // Call once at the end of the program - currently only used for 
MPI +# LLAMA_API void llama_backend_free(); +def llama_backend_free(): + return _lib.llama_backend_free() + + +_lib.llama_backend_free.argtypes = [] +_lib.llama_backend_free.restype = None # LLAMA_API struct llama_model * llama_load_model_from_file( @@ -819,6 +829,39 @@ def llama_sample_frequency_and_presence_penalties( _lib.llama_sample_frequency_and_presence_penalties.restype = None +# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 +# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. +# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. +# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. +# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits. +# LLAMA_API void llama_sample_classifier_free_guidance( +# struct llama_context * ctx, +# llama_token_data_array * candidates, +# struct llama_context * guidance_ctx, +# float scale, +# float smooth_factor); +def llama_sample_classifier_free_guidance( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + guidance_ctx: llama_context_p, + scale: c_float, + smooth_factor: c_float, +): + return _lib.llama_sample_classifier_free_guidance( + ctx, candidates, guidance_ctx, scale, smooth_factor + ) + + +_lib.llama_sample_classifier_free_guidance.argtypes = [ + llama_context_p, + llama_token_data_array_p, + llama_context_p, + c_float, + c_float, +] +_lib.llama_sample_classifier_free_guidance.restype = None + + # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. # LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_softmax( @@ -1063,5 +1106,5 @@ def llama_print_system_info() -> bytes: _llama_initialized = False if not _llama_initialized: - llama_init_backend(c_bool(False)) + llama_backend_init(c_bool(False)) _llama_initialized = True diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1d1630996..32c541163 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1d1630996920f889cdc08de26cebf2415958540e +Subproject commit 32c54116318929c90fd7ae814cf9b5232cd44c36 From de4cc5a233952e0dede642702f3170cd1bae5869 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 13 Jul 2023 23:25:12 -0400 Subject: [PATCH 37/48] bugfix: pydantic v2 fields --- llama_cpp/server/app.py | 108 +++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 58 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ffd07fa6b..202a06ddc 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -31,9 +31,7 @@ class Settings(BaseSettings): ge=0, description="The number of layers to put on the GPU. The rest will be on the CPU.", ) - seed: int = Field( - default=1337, description="Random seed. -1 for random." - ) + seed: int = Field(default=1337, description="Random seed. -1 for random.") n_batch: int = Field( default=512, ge=1, description="The batch size to use per eval." 
) @@ -80,12 +78,8 @@ class Settings(BaseSettings): verbose: bool = Field( default=True, description="Whether to print debug information." ) - host: str = Field( - default="localhost", description="Listen address" - ) - port: int = Field( - default=8000, description="Listen port" - ) + host: str = Field(default="localhost", description="Listen address") + port: int = Field(default=8000, description="Listen port") interrupt_requests: bool = Field( default=True, description="Whether to interrupt requests when a new request is received.", @@ -178,7 +172,7 @@ def get_settings(): yield settings -model_field = Field(description="The model to use for generating completions.") +model_field = Field(description="The model to use for generating completions.", default=None) max_tokens_field = Field( default=16, ge=1, le=2048, description="The maximum number of tokens to generate." @@ -242,21 +236,18 @@ def get_settings(): default=0, ge=0, le=2, - description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)" + description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)", ) mirostat_tau_field = Field( default=5.0, ge=0.0, le=10.0, - description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text" + description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text", ) mirostat_eta_field = Field( - default=0.1, - ge=0.001, - le=1.0, - description="Mirostat learning rate" + default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate" ) @@ -294,22 +285,23 @@ class CreateCompletionRequest(BaseModel): model: Optional[str] = model_field n: Optional[int] = 1 best_of: Optional[int] = 1 - user: Optional[str] = Field(None) + user: Optional[str] = Field(default=None) # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - class Config: - schema_extra = { - "example": { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } + model_config = { + "json_schema_extra": { + "examples": [ + { + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": ["\n", "###"], + } + ] } - - + } def make_logit_bias_processor( @@ -328,7 +320,7 @@ def make_logit_bias_processor( elif logit_bias_type == "tokens": for token, score in logit_bias.items(): - token = token.encode('utf-8') + token = token.encode("utf-8") for input_id in llama.tokenize(token, add_bos=False): to_bias[input_id] = score @@ -352,7 +344,7 @@ async def create_completion( request: Request, body: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), -): +) -> llama_cpp.Completion: if isinstance(body.prompt, list): assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" @@ -364,7 +356,7 @@ async def create_completion( "logit_bias_type", "user", } - kwargs = body.dict(exclude=exclude) + kwargs = body.model_dump(exclude=exclude) if body.logit_bias is not None: kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ @@ -396,7 +388,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): return EventSourceResponse( recv_chan, data_sender_callable=partial(event_publisher, 
send_chan) - ) + ) # type: ignore else: completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs) # type: ignore return completion @@ -405,16 +397,17 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): class CreateEmbeddingRequest(BaseModel): model: Optional[str] = model_field input: Union[str, List[str]] = Field(description="The input to embed.") - user: Optional[str] - - class Config: - schema_extra = { - "example": { - "input": "The food was delicious and the waiter...", - } + user: Optional[str] = Field(default=None) + + model_config = { + "json_schema_extra": { + "examples": [ + { + "input": "The food was delicious and the waiter...", + } + ] } - - + } @router.post( @@ -424,7 +417,7 @@ async def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): return await run_in_threadpool( - llama.create_embedding, **request.dict(exclude={"user"}) + llama.create_embedding, **request.model_dump(exclude={"user"}) ) @@ -461,21 +454,22 @@ class CreateChatCompletionRequest(BaseModel): repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - class Config: - schema_extra = { - "example": { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" - ), - ] - } + model_config = { + "json_schema_extra": { + "examples": [ + { + "messages": [ + ChatCompletionRequestMessage( + role="system", content="You are a helpful assistant." + ).model_dump(), + ChatCompletionRequestMessage( + role="user", content="What is the capital of France?" + ).model_dump(), + ] + } + ] } - - + } @router.post( @@ -486,14 +480,14 @@ async def create_chat_completion( body: CreateChatCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), settings: Settings = Depends(get_settings), -) -> Union[llama_cpp.ChatCompletion]: # type: ignore +) -> llama_cpp.ChatCompletion: exclude = { "n", "logit_bias", "logit_bias_type", "user", } - kwargs = body.dict(exclude=exclude) + kwargs = body.model_dump(exclude=exclude) if body.logit_bias is not None: kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ @@ -526,7 +520,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): return EventSourceResponse( recv_chan, data_sender_callable=partial(event_publisher, send_chan), - ) + ) # type: ignore else: completion: llama_cpp.ChatCompletion = await run_in_threadpool( llama.create_chat_completion, **kwargs # type: ignore @@ -546,8 +540,6 @@ class ModelList(TypedDict): data: List[ModelData] - - @router.get("/v1/models") async def get_models( settings: Settings = Depends(get_settings), From 6705f9b6c6b3369481c4e2e0e15d0f1af7a96eff Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 13 Jul 2023 23:32:06 -0400 Subject: [PATCH 38/48] Bump version --- CHANGELOG.md | 10 ++++++++++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b5db37b0..47b55a73d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.71] + +### Added + +- (llama.cpp) Update llama.cpp + +### Fixed + +- (server) Fix several pydantic v2 migration bugs + ## [0.1.70] ### Fixed diff --git a/pyproject.toml b/pyproject.toml index a9e012e6b..1cff2318c 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.70" +version = "0.1.71" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index ab5d825d4..71af72c44 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.70", + version="0.1.71", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From e6c67c8f7d0e6cb85e27a4efb53569a6c304a344 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Jul 2023 16:40:31 -0400 Subject: [PATCH 39/48] Update llama.cpp --- llama_cpp/llama_cpp.py | 74 ++++++++++++++++++++++++++++++++++++++++-- vendor/llama.cpp | 2 +- 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index b5bab56a3..04de04663 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -656,6 +656,22 @@ def llama_tokenize( _lib.llama_tokenize.restype = c_int +# LLAMA_API int llama_tokenize_with_model( +# const struct llama_model * model, +# const char * text, +# llama_token * tokens, +# int n_max_tokens, +# bool add_bos); +def llama_tokenize_with_model( + model: llama_model_p, + text: bytes, + tokens, # type: Array[llama_token] + n_max_tokens: c_int, + add_bos: c_bool, +) -> int: + return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos) + + # LLAMA_API int llama_n_vocab(const struct llama_context * ctx); def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -683,6 +699,33 @@ def llama_n_embd(ctx: llama_context_p) -> int: _lib.llama_n_embd.restype = c_int +# LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); +def llama_n_vocab_from_model(model: llama_model_p) -> int: + return _lib.llama_n_vocab_from_model(model) + + +_lib.llama_n_vocab_from_model.argtypes = [llama_model_p] +_lib.llama_n_vocab_from_model.restype = c_int + + +# LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); +def llama_n_ctx_from_model(model: llama_model_p) -> int: + return _lib.llama_n_ctx_from_model(model) + + +_lib.llama_n_ctx_from_model.argtypes = [llama_model_p] +_lib.llama_n_ctx_from_model.restype = c_int + + +# LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); +def llama_n_embd_from_model(model: llama_model_p) -> int: + return _lib.llama_n_embd_from_model(model) + + +_lib.llama_n_embd_from_model.argtypes = [llama_model_p] +_lib.llama_n_embd_from_model.restype = c_int + + # // Get the vocabulary as output parameters. # // Returns number of results. 
# LLAMA_API int llama_get_vocab( @@ -703,6 +746,20 @@ def llama_get_vocab( _lib.llama_get_vocab.restype = c_int +# LLAMA_API int llama_get_vocab_from_model( +# const struct llama_model * model, +# const char * * strings, +# float * scores, +# int capacity); +def llama_get_vocab_from_model( + model: llama_model_p, + strings, # type: Array[c_char_p] # type: ignore + scores, # type: Array[c_float] # type: ignore + capacity: c_int, +) -> int: + return _lib.llama_get_vocab_from_model(model, strings, scores, capacity) + + # Token logits obtained from the last call to llama_eval() # The logits for the last token are stored in the last row # Can be mutated in order to change the probabilities of the next token @@ -732,8 +789,10 @@ def llama_get_embeddings( _lib.llama_get_embeddings.restype = c_float_p -# Token Id -> String. Uses the vocabulary in the provided context -# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); +# // Token Id -> String. Uses the vocabulary in the provided context +# LLAMA_API const char * llama_token_to_str( +# const struct llama_context * ctx, +# llama_token token); def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: return _lib.llama_token_to_str(ctx, token) @@ -741,6 +800,17 @@ def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: _lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] _lib.llama_token_to_str.restype = c_char_p + +# LLAMA_API const char * llama_token_to_str_with_model( +# const struct llama_model * model, +# llama_token token); +def llama_token_to_str_with_model(model: llama_model_p, token: llama_token) -> bytes: + return _lib.llama_token_to_str_with_model(model, token) + + +_lib.llama_token_to_str_with_model.argtypes = [llama_model_p, llama_token] +_lib.llama_token_to_str_with_model.restype = c_char_p + # Special tokens diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 32c541163..a6803cab9 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 32c54116318929c90fd7ae814cf9b5232cd44c36 +Subproject commit a6803cab946c817fb7aaf2a40b317f5d3e373bd1 From 25b3494e11cc0a51bbfacc86353eabd9f1d6a147 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Jul 2023 16:40:53 -0400 Subject: [PATCH 40/48] Minor fix to tensor_split parameter --- llama_cpp/llama.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c5869edfd..849e7752e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -207,7 +207,6 @@ def __init__( n_ctx: int = 512, n_parts: int = -1, n_gpu_layers: int = 0, - tensor_split: list[float] = None, seed: int = 1337, f16_kv: bool = True, logits_all: bool = False, @@ -221,6 +220,7 @@ def __init__( lora_base: Optional[str] = None, lora_path: Optional[str] = None, low_vram: bool = False, + tensor_split: Optional[List[float]] = None, verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -241,6 +241,7 @@ def __init__( last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. + tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split. verbose: Print verbose output to stderr. Raises: @@ -249,12 +250,6 @@ def __init__( Returns: A Llama instance. 
""" - if tensor_split is None: - tensor_split = [0.0] * llama_cpp.LLAMA_MAX_DEVICES.value - - #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES - FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value - c_tensor_split = FloatArray(*tensor_split) self.verbose = verbose self.model_path = model_path @@ -262,7 +257,6 @@ def __init__( self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx self.params.n_gpu_layers = n_gpu_layers - self.params.tensor_split = c_tensor_split self.params.seed = seed self.params.f16_kv = f16_kv self.params.logits_all = logits_all @@ -272,6 +266,15 @@ def __init__( self.params.embedding = embedding self.params.low_vram = low_vram + self.tensor_split = tensor_split + self._c_tensor_split = None + + if self.tensor_split is not None: + #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES + FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value + self._c_tensor_split = FloatArray(*tensor_split) # keep a reference to the array so it is not gc'd + self.params.tensor_split = self._c_tensor_split + self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) @@ -1499,7 +1502,6 @@ def __getstate__(self): model_path=self.model_path, n_ctx=self.params.n_ctx, n_gpu_layers=self.params.n_gpu_layers, - tensor_split=self.params.tensor_split, seed=self.params.seed, f16_kv=self.params.f16_kv, logits_all=self.params.logits_all, @@ -1513,6 +1515,7 @@ def __getstate__(self): n_threads=self.n_threads, lora_base=self.lora_base, lora_path=self.lora_path, + tensor_split=self.tensor_split, ### DEPRECATED ### n_parts=self.n_parts, ### DEPRECATED ### @@ -1524,7 +1527,6 @@ def __setstate__(self, state): n_ctx=state["n_ctx"], n_parts=state["n_parts"], n_gpu_layers=state["n_gpu_layers"], - tensor_split=state["tensor_split"], seed=state["seed"], f16_kv=state["f16_kv"], logits_all=state["logits_all"], @@ -1538,6 +1540,7 @@ def __setstate__(self, state): last_n_tokens_size=state["last_n_tokens_size"], lora_base=state["lora_base"], lora_path=state["lora_path"], + tensor_split=state["tensor_split"], verbose=state["verbose"], ) From 118b7f6d5c2cacab8d2c4a4c2d44b6a4eda03b37 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Jul 2023 16:52:48 -0400 Subject: [PATCH 41/48] fix: tensor_split should be optional list --- llama_cpp/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 8dc5a0f03..eaa6f44a9 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -31,7 +31,7 @@ class Settings(BaseSettings): ge=0, description="The number of layers to put on the GPU. 
The rest will be on the CPU.", ) - tensor_split: List[float] = Field( + tensor_split: Optional[List[float]] = Field( default=None, description="Split layers across multiple GPUs in proportion.", ) From f72b6e9b732396ac29fb44e5cc43d4743d1b6fd9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Jul 2023 15:01:08 -0400 Subject: [PATCH 42/48] Update llama.cpp --- llama_cpp/llama_cpp.py | 8 +++++++- vendor/llama.cpp | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 04de04663..aef4f65c7 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -165,12 +165,16 @@ class llama_token_data_array(Structure): # int32_t n_gpu_layers; // number of layers to store in VRAM # int32_t main_gpu; // the GPU that is used for scratch and small tensors # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs + +# // ref: https://github.com/ggerganov/llama.cpp/pull/2054 +# float rope_freq_base; // RoPE base frequency +# float rope_freq_scale; // RoPE frequency scaling factor + # // called with a progress value between 0 and 1, pass NULL to disable # llama_progress_callback progress_callback; # // context pointer passed to the progress callback # void * progress_callback_user_data; - # // Keep the booleans together to avoid misalignment during copy-by-value. # bool low_vram; // if true, reduce VRAM usage at the cost of performance # bool f16_kv; // use fp16 for KV cache @@ -188,6 +192,8 @@ class llama_context_params(Structure): ("n_gpu_layers", c_int32), ("main_gpu", c_int32), ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), + ("rope_freq_base", c_float), + ("rope_freq_scale", c_float), ("progress_callback", llama_progress_callback), ("progress_callback_user_data", c_void_p), ("low_vram", c_bool), diff --git a/vendor/llama.cpp b/vendor/llama.cpp index a6803cab9..6e7cca404 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit a6803cab946c817fb7aaf2a40b317f5d3e373bd1 +Subproject commit 6e7cca404748dd4b1a3affd0d1296e37f4ac0a6f From f0797a6054d97530663f5831ef498f45ceeda113 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Jul 2023 15:11:01 -0400 Subject: [PATCH 43/48] Merge branch main into custom_rope --- CHANGELOG.md | 10 +++ README.md | 1 + llama_cpp/llama.py | 16 ++++- llama_cpp/llama_cpp.py | 137 +++++++++++++++++++++++++++++++++++++--- llama_cpp/server/app.py | 109 ++++++++++++++++---------------- pyproject.toml | 2 +- setup.py | 4 +- vendor/llama.cpp | 2 +- 8 files changed, 212 insertions(+), 69 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b5db37b0..47b55a73d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.71] + +### Added + +- (llama.cpp) Update llama.cpp + +### Fixed + +- (server) Fix several pydantic v2 migration bugs + ## [0.1.70] ### Fixed diff --git a/README.md b/README.md index 0322c73a3..1f3dcb5ab 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,7 @@ A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). 
```bash docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest ``` +[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389) ## Low-level API diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ada6d695b..7bda0461b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -19,6 +19,7 @@ from collections import deque, OrderedDict import diskcache +import ctypes from . import llama_cpp from .llama_types import * @@ -26,7 +27,6 @@ import numpy as np import numpy.typing as npt - class BaseLlamaCache(ABC): """Base cache class for a llama.cpp model.""" @@ -222,6 +222,7 @@ def __init__( lora_base: Optional[str] = None, lora_path: Optional[str] = None, low_vram: bool = False, + tensor_split: Optional[List[float]] = None, verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -244,6 +245,7 @@ def __init__( last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. + tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split. verbose: Print verbose output to stderr. Raises: @@ -252,6 +254,7 @@ def __init__( Returns: A Llama instance. """ + self.verbose = verbose self.model_path = model_path @@ -269,6 +272,15 @@ def __init__( self.params.embedding = embedding self.params.low_vram = low_vram + self.tensor_split = tensor_split + self._c_tensor_split = None + + if self.tensor_split is not None: + #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES + FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value + self._c_tensor_split = FloatArray(*tensor_split) # keep a reference to the array so it is not gc'd + self.params.tensor_split = self._c_tensor_split + self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) @@ -1509,6 +1521,7 @@ def __getstate__(self): n_threads=self.n_threads, lora_base=self.lora_base, lora_path=self.lora_path, + tensor_split=self.tensor_split, ### DEPRECATED ### n_parts=self.n_parts, ### DEPRECATED ### @@ -1533,6 +1546,7 @@ def __setstate__(self, state): last_n_tokens_size=state["last_n_tokens_size"], lora_base=state["lora_base"], lora_path=state["lora_path"], + tensor_split=state["tensor_split"], verbose=state["verbose"], ) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 320c48b9c..32f70f005 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -165,12 +165,16 @@ class llama_token_data_array(Structure): # int32_t n_gpu_layers; // number of layers to store in VRAM # int32_t main_gpu; // the GPU that is used for scratch and small tensors # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs + +# // ref: https://github.com/ggerganov/llama.cpp/pull/2054 +# float rope_freq_base; // RoPE base frequency +# float rope_freq_scale; // RoPE frequency scaling factor + # // called with a progress value between 0 and 1, pass NULL to disable # llama_progress_callback progress_callback; # // context pointer passed to the progress callback # void * progress_callback_user_data; - # // Keep the booleans together to avoid misalignment during copy-by-value. 
# bool low_vram; // if true, reduce VRAM usage at the cost of performance # bool f16_kv; // use fp16 for KV cache @@ -190,6 +194,8 @@ class llama_context_params(Structure): ("n_gpu_layers", c_int32), ("main_gpu", c_int32), ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), + ("rope_freq_base", c_float), + ("rope_freq_scale", c_float), ("progress_callback", llama_progress_callback), ("progress_callback_user_data", c_void_p), ("low_vram", c_bool), @@ -328,13 +334,23 @@ def llama_mlock_supported() -> bool: # // Initialize the llama + ggml backend # // If numa is true, use NUMA optimizations # // Call once at the start of the program -# LLAMA_API void llama_init_backend(bool numa); -def llama_init_backend(numa: c_bool): - return _lib.llama_init_backend(numa) +# LLAMA_API void llama_backend_init(bool numa); +def llama_backend_init(numa: c_bool): + return _lib.llama_backend_init(numa) + + +_lib.llama_backend_init.argtypes = [c_bool] +_lib.llama_backend_init.restype = None + +# // Call once at the end of the program - currently only used for MPI +# LLAMA_API void llama_backend_free(); +def llama_backend_free(): + return _lib.llama_backend_free() -_lib.llama_init_backend.argtypes = [c_bool] -_lib.llama_init_backend.restype = None + +_lib.llama_backend_free.argtypes = [] +_lib.llama_backend_free.restype = None # LLAMA_API struct llama_model * llama_load_model_from_file( @@ -648,6 +664,22 @@ def llama_tokenize( _lib.llama_tokenize.restype = c_int +# LLAMA_API int llama_tokenize_with_model( +# const struct llama_model * model, +# const char * text, +# llama_token * tokens, +# int n_max_tokens, +# bool add_bos); +def llama_tokenize_with_model( + model: llama_model_p, + text: bytes, + tokens, # type: Array[llama_token] + n_max_tokens: c_int, + add_bos: c_bool, +) -> int: + return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos) + + # LLAMA_API int llama_n_vocab(const struct llama_context * ctx); def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -675,6 +707,33 @@ def llama_n_embd(ctx: llama_context_p) -> int: _lib.llama_n_embd.restype = c_int +# LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); +def llama_n_vocab_from_model(model: llama_model_p) -> int: + return _lib.llama_n_vocab_from_model(model) + + +_lib.llama_n_vocab_from_model.argtypes = [llama_model_p] +_lib.llama_n_vocab_from_model.restype = c_int + + +# LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); +def llama_n_ctx_from_model(model: llama_model_p) -> int: + return _lib.llama_n_ctx_from_model(model) + + +_lib.llama_n_ctx_from_model.argtypes = [llama_model_p] +_lib.llama_n_ctx_from_model.restype = c_int + + +# LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); +def llama_n_embd_from_model(model: llama_model_p) -> int: + return _lib.llama_n_embd_from_model(model) + + +_lib.llama_n_embd_from_model.argtypes = [llama_model_p] +_lib.llama_n_embd_from_model.restype = c_int + + # // Get the vocabulary as output parameters. # // Returns number of results. 
# LLAMA_API int llama_get_vocab( @@ -695,6 +754,20 @@ def llama_get_vocab( _lib.llama_get_vocab.restype = c_int +# LLAMA_API int llama_get_vocab_from_model( +# const struct llama_model * model, +# const char * * strings, +# float * scores, +# int capacity); +def llama_get_vocab_from_model( + model: llama_model_p, + strings, # type: Array[c_char_p] # type: ignore + scores, # type: Array[c_float] # type: ignore + capacity: c_int, +) -> int: + return _lib.llama_get_vocab_from_model(model, strings, scores, capacity) + + # Token logits obtained from the last call to llama_eval() # The logits for the last token are stored in the last row # Can be mutated in order to change the probabilities of the next token @@ -724,8 +797,10 @@ def llama_get_embeddings( _lib.llama_get_embeddings.restype = c_float_p -# Token Id -> String. Uses the vocabulary in the provided context -# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); +# // Token Id -> String. Uses the vocabulary in the provided context +# LLAMA_API const char * llama_token_to_str( +# const struct llama_context * ctx, +# llama_token token); def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: return _lib.llama_token_to_str(ctx, token) @@ -733,6 +808,17 @@ def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: _lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] _lib.llama_token_to_str.restype = c_char_p + +# LLAMA_API const char * llama_token_to_str_with_model( +# const struct llama_model * model, +# llama_token token); +def llama_token_to_str_with_model(model: llama_model_p, token: llama_token) -> bytes: + return _lib.llama_token_to_str_with_model(model, token) + + +_lib.llama_token_to_str_with_model.argtypes = [llama_model_p, llama_token] +_lib.llama_token_to_str_with_model.restype = c_char_p + # Special tokens @@ -821,6 +907,39 @@ def llama_sample_frequency_and_presence_penalties( _lib.llama_sample_frequency_and_presence_penalties.restype = None +# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 +# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. +# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. +# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. +# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits. 
+# LLAMA_API void llama_sample_classifier_free_guidance( +# struct llama_context * ctx, +# llama_token_data_array * candidates, +# struct llama_context * guidance_ctx, +# float scale, +# float smooth_factor); +def llama_sample_classifier_free_guidance( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + guidance_ctx: llama_context_p, + scale: c_float, + smooth_factor: c_float, +): + return _lib.llama_sample_classifier_free_guidance( + ctx, candidates, guidance_ctx, scale, smooth_factor + ) + + +_lib.llama_sample_classifier_free_guidance.argtypes = [ + llama_context_p, + llama_token_data_array_p, + llama_context_p, + c_float, + c_float, +] +_lib.llama_sample_classifier_free_guidance.restype = None + + # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. # LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_softmax( @@ -1065,5 +1184,5 @@ def llama_print_system_info() -> bytes: _llama_initialized = False if not _llama_initialized: - llama_init_backend(c_bool(False)) + llama_backend_init(c_bool(False)) _llama_initialized = True diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ffd07fa6b..eaa6f44a9 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -31,6 +31,10 @@ class Settings(BaseSettings): ge=0, description="The number of layers to put on the GPU. The rest will be on the CPU.", ) + tensor_split: Optional[List[float]] = Field( + default=None, + description="Split layers across multiple GPUs in proportion.", + ) seed: int = Field( default=1337, description="Random seed. -1 for random." ) @@ -80,12 +84,8 @@ class Settings(BaseSettings): verbose: bool = Field( default=True, description="Whether to print debug information." ) - host: str = Field( - default="localhost", description="Listen address" - ) - port: int = Field( - default=8000, description="Listen port" - ) + host: str = Field(default="localhost", description="Listen address") + port: int = Field(default=8000, description="Listen port") interrupt_requests: bool = Field( default=True, description="Whether to interrupt requests when a new request is received.", @@ -117,6 +117,7 @@ def create_app(settings: Optional[Settings] = None): llama = llama_cpp.Llama( model_path=settings.model, n_gpu_layers=settings.n_gpu_layers, + tensor_split=settings.tensor_split, seed=settings.seed, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, @@ -178,7 +179,7 @@ def get_settings(): yield settings -model_field = Field(description="The model to use for generating completions.") +model_field = Field(description="The model to use for generating completions.", default=None) max_tokens_field = Field( default=16, ge=1, le=2048, description="The maximum number of tokens to generate." @@ -242,21 +243,18 @@ def get_settings(): default=0, ge=0, le=2, - description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)" + description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)", ) mirostat_tau_field = Field( default=5.0, ge=0.0, le=10.0, - description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text" + description="Mirostat target entropy, i.e. 
the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text", ) mirostat_eta_field = Field( - default=0.1, - ge=0.001, - le=1.0, - description="Mirostat learning rate" + default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate" ) @@ -294,22 +292,23 @@ class CreateCompletionRequest(BaseModel): model: Optional[str] = model_field n: Optional[int] = 1 best_of: Optional[int] = 1 - user: Optional[str] = Field(None) + user: Optional[str] = Field(default=None) # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - class Config: - schema_extra = { - "example": { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } + model_config = { + "json_schema_extra": { + "examples": [ + { + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": ["\n", "###"], + } + ] } - - + } def make_logit_bias_processor( @@ -328,7 +327,7 @@ def make_logit_bias_processor( elif logit_bias_type == "tokens": for token, score in logit_bias.items(): - token = token.encode('utf-8') + token = token.encode("utf-8") for input_id in llama.tokenize(token, add_bos=False): to_bias[input_id] = score @@ -352,7 +351,7 @@ async def create_completion( request: Request, body: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), -): +) -> llama_cpp.Completion: if isinstance(body.prompt, list): assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" @@ -364,7 +363,7 @@ async def create_completion( "logit_bias_type", "user", } - kwargs = body.dict(exclude=exclude) + kwargs = body.model_dump(exclude=exclude) if body.logit_bias is not None: kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ @@ -396,7 +395,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): return EventSourceResponse( recv_chan, data_sender_callable=partial(event_publisher, send_chan) - ) + ) # type: ignore else: completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs) # type: ignore return completion @@ -405,16 +404,17 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): class CreateEmbeddingRequest(BaseModel): model: Optional[str] = model_field input: Union[str, List[str]] = Field(description="The input to embed.") - user: Optional[str] - - class Config: - schema_extra = { - "example": { - "input": "The food was delicious and the waiter...", - } + user: Optional[str] = Field(default=None) + + model_config = { + "json_schema_extra": { + "examples": [ + { + "input": "The food was delicious and the waiter...", + } + ] } - - + } @router.post( @@ -424,7 +424,7 @@ async def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): return await run_in_threadpool( - llama.create_embedding, **request.dict(exclude={"user"}) + llama.create_embedding, **request.model_dump(exclude={"user"}) ) @@ -461,21 +461,22 @@ class CreateChatCompletionRequest(BaseModel): repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - class Config: - schema_extra = { - "example": { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" 
- ), - ] - } + model_config = { + "json_schema_extra": { + "examples": [ + { + "messages": [ + ChatCompletionRequestMessage( + role="system", content="You are a helpful assistant." + ).model_dump(), + ChatCompletionRequestMessage( + role="user", content="What is the capital of France?" + ).model_dump(), + ] + } + ] } - - + } @router.post( @@ -486,14 +487,14 @@ async def create_chat_completion( body: CreateChatCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), settings: Settings = Depends(get_settings), -) -> Union[llama_cpp.ChatCompletion]: # type: ignore +) -> llama_cpp.ChatCompletion: exclude = { "n", "logit_bias", "logit_bias_type", "user", } - kwargs = body.dict(exclude=exclude) + kwargs = body.model_dump(exclude=exclude) if body.logit_bias is not None: kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ @@ -526,7 +527,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): return EventSourceResponse( recv_chan, data_sender_callable=partial(event_publisher, send_chan), - ) + ) # type: ignore else: completion: llama_cpp.ChatCompletion = await run_in_threadpool( llama.create_chat_completion, **kwargs # type: ignore @@ -546,8 +547,6 @@ class ModelList(TypedDict): data: List[ModelData] - - @router.get("/v1/models") async def get_models( settings: Settings = Depends(get_settings), diff --git a/pyproject.toml b/pyproject.toml index a9e012e6b..1cff2318c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.70" +version = "0.1.71" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index b8acedb5a..71af72c44 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.70", + version="0.1.71", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", @@ -18,7 +18,7 @@ packages=["llama_cpp", "llama_cpp.server"], install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ - "server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"], + "server": ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"], }, python_requires=">=3.7", classifiers=[ diff --git a/vendor/llama.cpp b/vendor/llama.cpp index a3b4d9328..6e7cca404 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit a3b4d932859f4e51ed716bfa1f07e2d2eede2c23 +Subproject commit 6e7cca404748dd4b1a3affd0d1296e37f4ac0a6f From bdf32df255104df1b45453431e9fa19a03220a8a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Jul 2023 15:34:32 -0400 Subject: [PATCH 44/48] Add additional direnv directory to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 36ed7f7fd..3866fb251 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ _skbuild/ .envrc +.direnv models/ From e4f9db37db5dca97f22ec53e169ea047e23462c0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Jul 2023 15:34:55 -0400 Subject: [PATCH 45/48] Fix context_params struct layout --- llama_cpp/llama_cpp.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 32f70f005..aef4f65c7 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -188,8 +188,6 @@ class 
llama_context_params(Structure): _fields_ = [ ("seed", c_uint32), ("n_ctx", c_int32), - ("rope_freq_base", c_float), - ("rope_freq_scale", c_float), ("n_batch", c_int32), ("n_gpu_layers", c_int32), ("main_gpu", c_int32), From 8ab098e49dfcc5a4afbcb2f11d54efa2cb606972 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Jul 2023 15:35:08 -0400 Subject: [PATCH 46/48] Re-order Llama class params --- llama_cpp/llama.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7bda0461b..92ca67d65 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -205,8 +205,6 @@ def __init__( model_path: str, # NOTE: These parameters are likely to change in the future. n_ctx: int = 512, - rope_freq_base: float = 10000.0, - rope_freq_scale: float = 1.0, n_parts: int = -1, n_gpu_layers: int = 0, seed: int = 1337, @@ -223,6 +221,8 @@ def __init__( lora_path: Optional[str] = None, low_vram: bool = False, tensor_split: Optional[List[float]] = None, + rope_freq_base: float = 10000.0, + rope_freq_scale: float = 1.0, verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -230,8 +230,6 @@ def __init__( Args: model_path: Path to the model. n_ctx: Maximum context size. - rope_freq_base: RoPE base frequency. - rope_freq_scale: RoPE frequency scale. n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. seed: Random seed. -1 for random. f16_kv: Use half-precision for key/value cache. @@ -246,6 +244,8 @@ def __init__( lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split. + rope_freq_base: Base frequency for rope sampling. + rope_freq_scale: Scale factor for rope sampling. verbose: Print verbose output to stderr. 
Raises: @@ -260,8 +260,6 @@ def __init__( self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx - self.params.rope_freq_base = rope_freq_base - self.params.rope_freq_scale = rope_freq_scale self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed self.params.f16_kv = f16_kv @@ -281,6 +279,9 @@ def __init__( self._c_tensor_split = FloatArray(*tensor_split) # keep a reference to the array so it is not gc'd self.params.tensor_split = self._c_tensor_split + self.params.rope_freq_base = rope_freq_base + self.params.rope_freq_scale = rope_freq_scale + self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) From 6d8892fe64ca7eadd503ae01f93fbcd9ff3806dd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Jul 2023 17:13:55 -0400 Subject: [PATCH 47/48] Bump version --- CHANGELOG.md | 6 ++++++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47b55a73d..c7723c529 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.72] + +### Added + +- (llama.cpp) Update llama.cpp added custom_rope for extended context lengths + ## [0.1.71] ### Added diff --git a/pyproject.toml b/pyproject.toml index 1cff2318c..7839a869a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.71" +version = "0.1.72" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 71af72c44..9b4de9785 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.71", + version="0.1.72", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From c6fb8764e6daa42fee8659b6e169ec40b289efeb Mon Sep 17 00:00:00 2001 From: Mozer Date: Sun, 16 Jul 2023 13:41:56 +0300 Subject: [PATCH 48/48] Update llama.py --- llama_cpp/llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 92ca67d65..ed27476e7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -221,8 +221,8 @@ def __init__( lora_path: Optional[str] = None, low_vram: bool = False, tensor_split: Optional[List[float]] = None, - rope_freq_base: float = 10000.0, - rope_freq_scale: float = 1.0, + rope_freq_base: float = 80000.0, + rope_freq_scale: float = 0.5, verbose: bool = True, ): """Load a llama.cpp model from `model_path`.
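
---

The net effect of this series on the high-level API is that `Llama(...)` now accepts `tensor_split`, `rope_freq_base`, and `rope_freq_scale`, and the final patch (48/48) changes the RoPE defaults from 10000.0 / 1.0 to 80000.0 / 0.5. Below is a minimal usage sketch, assuming llama-cpp-python around 0.1.72 and a local GGML model at an illustrative path; the path and numeric values are examples only, and the RoPE values shown explicitly restore the pre-patch-48 defaults.

```python
# Sketch of the constructor parameters introduced in this patch series.
# The model path and all values are illustrative assumptions, not recommendations.
from llama_cpp import Llama

llm = Llama(
    model_path="/models/ggml-model-name.bin",  # assumed local model path
    n_ctx=4096,              # larger context, made usable by custom RoPE scaling
    n_gpu_layers=32,         # number of layers to offload to the GPU
    tensor_split=None,       # e.g. [0.6, 0.4] to split layers across two GPUs on a multi-GPU build
    rope_freq_base=10000.0,  # RoPE base frequency (patch 48 changes the default to 80000.0)
    rope_freq_scale=1.0,     # RoPE frequency scale (patch 48 changes the default to 0.5)
)

output = llm(
    "Q: What is the capital of France? A:",
    max_tokens=32,
    stop=["\n"],
)
print(output["choices"][0]["text"])
```

Per the diff in patch 40, when `tensor_split` is `None` the model is not split; otherwise the list is expanded into a `c_float` array of length `LLAMA_MAX_DEVICES` and a reference is kept on the instance so the array is not garbage-collected before llama.cpp reads it.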