From 90e102115419453d75ba59c1d35ef23b8bbcea6e Mon Sep 17 00:00:00 2001 From: jm12138 <2286040843@qq.com> Date: Mon, 10 Apr 2023 15:56:05 +0000 Subject: [PATCH 01/48] Add unlimited max_tokens --- llama_cpp/llama.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 2d76ec402..880e42d66 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -317,7 +317,15 @@ def _create_completion( if self.verbose: llama_cpp.llama_reset_timings(self.ctx) - if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)): + if max_tokens <= 0: + # Unlimited, depending on n_ctx. + if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)): + raise ValueError( + f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" + ) + else: + max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens) + elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)): raise ValueError( f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" ) @@ -455,7 +463,7 @@ def create_completion( Args: prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. - max_tokens: The maximum number of tokens to generate. + max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx. temperature: The temperature to use for sampling. top_p: The top-p value to use for sampling. logprobs: The number of logprobs to return. If None, no logprobs are returned. @@ -510,7 +518,7 @@ def __call__( Args: prompt: The prompt to generate text from. suffix: A suffix to append to the generated text. If None, no suffix is appended. - max_tokens: The maximum number of tokens to generate. + max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx. temperature: The temperature to use for sampling. top_p: The top-p value to use for sampling. logprobs: The number of logprobs to return. If None, no logprobs are returned. @@ -619,7 +627,7 @@ def create_chat_completion( top_k: The top-k value to use for sampling. stream: Whether to stream the results. stop: A list of strings to stop generation when encountered. - max_tokens: The maximum number of tokens to generate. + max_tokens: The maximum number of tokens to generate. If max_tokens <= 0, the maximum number of tokens to generate is unlimited and depends on n_ctx. repeat_penalty: The penalty to apply to repeated tokens. Returns: From 036548365f16d05a638fe7c75a0d60fe452e954f Mon Sep 17 00:00:00 2001 From: SubhranshuSharma Date: Sat, 17 Jun 2023 14:50:07 +0530 Subject: [PATCH 02/48] added termux with root instructions --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0e62f3dfa..80c63f567 100644 --- a/README.md +++ b/README.md @@ -126,6 +126,7 @@ A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). 
```bash docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest ``` +[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389) ## Low-level API From 98ae4e58a3adce4b3cf775121ee1f1ac2ce5ddb6 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 6 Jul 2023 17:57:56 -0400 Subject: [PATCH 03/48] Update llama.cpp --- Makefile | 3 +++ llama_cpp/llama_cpp.py | 39 +++++++++++++++++++++++++++++++++++++++ vendor/llama.cpp | 2 +- 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 66d93f3a2..1be35cf8f 100644 --- a/Makefile +++ b/Makefile @@ -33,6 +33,9 @@ deploy.gh-docs: mkdocs build mkdocs gh-deploy +test: + python3 -m pytest + clean: - cd vendor/llama.cpp && make clean - cd vendor/llama.cpp && rm libllama.so diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index c68fb18d1..17c631961 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -2,6 +2,7 @@ import os import ctypes from ctypes import ( + c_double, c_int, c_float, c_char_p, @@ -169,6 +170,7 @@ class llama_token_data_array(Structure): # // context pointer passed to the progress callback # void * progress_callback_user_data; + # // Keep the booleans together to avoid misalignment during copy-by-value. # bool low_vram; // if true, reduce VRAM usage at the cost of performance # bool f16_kv; // use fp16 for KV cache @@ -256,6 +258,34 @@ class llama_model_quantize_params(Structure): ] +# // performance timing information +# struct llama_timings { +# double t_start_ms; +# double t_end_ms; +# double t_load_ms; +# double t_sample_ms; +# double t_p_eval_ms; +# double t_eval_ms; + + +# int32_t n_sample; +# int32_t n_p_eval; +# int32_t n_eval; +# }; +class llama_timings(Structure): + _fields_ = [ + ("t_start_ms", c_double), + ("t_end_ms", c_double), + ("t_load_ms", c_double), + ("t_sample_ms", c_double), + ("t_p_eval_ms", c_double), + ("t_eval_ms", c_double), + ("n_sample", c_int32), + ("n_p_eval", c_int32), + ("n_eval", c_int32), + ] + + # LLAMA_API struct llama_context_params llama_context_default_params(); def llama_context_default_params() -> llama_context_params: return _lib.llama_context_default_params() @@ -991,6 +1021,15 @@ def llama_sample_token( # Performance information +# LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); +def llama_get_timings(ctx: llama_context_p) -> llama_timings: + return _lib.llama_get_timings(ctx) + + +_lib.llama_get_timings.argtypes = [llama_context_p] +_lib.llama_get_timings.restype = llama_timings + + # LLAMA_API void llama_print_timings(struct llama_context * ctx); def llama_print_timings(ctx: llama_context_p): _lib.llama_print_timings(ctx) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 7f0e9a775..dfd9fce6d 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 7f0e9a775ecc4c6ade271c217f63d6dc93e79eaa +Subproject commit dfd9fce6d65599bf33df43e616e85aa639bdae4c From 4c7cdcca00f63896a95e09a11f424237e224bc72 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 03:04:17 -0400 Subject: [PATCH 04/48] Add interruptible streaming requests for llama-cpp-python server. 
Closes #183 --- CHANGELOG.md | 4 ++++ llama_cpp/server/app.py | 31 +++++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c6cfaab28..11251c6df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [Added] + +- (server) Streaming requests can are now interrupted pre-maturely when a concurrent request is made. Can be controlled with the `interrupt_requests` setting. + ## [0.1.68] ## [Added] diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ef319c7e0..b9d57717e 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -146,12 +146,27 @@ def set_settings(_settings: Settings): return app -llama_lock = Lock() +llama_outer_lock = Lock() +llama_inner_lock = Lock() def get_llama(): - with llama_lock: - yield llama + # NOTE: This double lock allows the currently streaming llama model to + # check if any other requests are pending in the same thread and cancel + # the stream if so. + llama_outer_lock.acquire() + release_outer_lock = True + try: + llama_inner_lock.acquire() + try: + llama_outer_lock.release() + release_outer_lock = False + yield llama + finally: + llama_inner_lock.release() + finally: + if release_outer_lock: + llama_outer_lock.release() def get_settings(): @@ -364,6 +379,9 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() + if llama_outer_lock.locked(): + await inner_send_chan.send(dict(data="[DONE]")) + raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) except anyio.get_cancelled_exc_class() as e: print("disconnected") @@ -371,7 +389,6 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): print( f"Disconnected from client (via refresh/close) {request.client}" ) - await inner_send_chan.send(dict(closing=True)) raise e return EventSourceResponse( @@ -494,6 +511,9 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() + if llama_outer_lock.locked(): + await inner_send_chan.send(dict(data="[DONE]")) + raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) except anyio.get_cancelled_exc_class() as e: print("disconnected") @@ -501,7 +521,6 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): print( f"Disconnected from client (via refresh/close) {request.client}" ) - await inner_send_chan.send(dict(closing=True)) raise e return EventSourceResponse( @@ -533,8 +552,8 @@ class ModelList(TypedDict): @router.get("/v1/models", response_model=GetModelResponse) async def get_models( settings: Settings = Depends(get_settings), - llama: llama_cpp.Llama = Depends(get_llama), ) -> ModelList: + assert llama is not None return { "object": "list", "data": [ From cc542b4452ec92919bb2964e40314c7077c264be Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 03:04:54 -0400 Subject: [PATCH 05/48] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index dfd9fce6d..481f793ac 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 
dfd9fce6d65599bf33df43e616e85aa639bdae4c +Subproject commit 481f793acc3882a09d45d8d2c3076ad3d1c60cfc From 57d8ec3899f2c48def77f8cf3d3feae45ca12aa3 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 03:37:23 -0400 Subject: [PATCH 06/48] Add setting to control request interruption --- llama_cpp/server/app.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index b9d57717e..5d47160e9 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -85,6 +85,10 @@ class Settings(BaseSettings): port: int = Field( default=8000, description="Listen port" ) + interrupt_requests: bool = Field( + default=True, + description="Whether to interrupt requests when a new request is received.", + ) router = APIRouter() @@ -379,7 +383,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): await inner_send_chan.send(dict(data=json.dumps(chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() - if llama_outer_lock.locked(): + if settings.interrupt_requests and llama_outer_lock.locked(): await inner_send_chan.send(dict(data="[DONE]")) raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) @@ -486,6 +490,7 @@ async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), + settings: Settings = Depends(get_settings), ) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: exclude = { "n", @@ -511,7 +516,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): await inner_send_chan.send(dict(data=json.dumps(chat_chunk))) if await request.is_disconnected(): raise anyio.get_cancelled_exc_class()() - if llama_outer_lock.locked(): + if settings.interrupt_requests and llama_outer_lock.locked(): await inner_send_chan.send(dict(data="[DONE]")) raise anyio.get_cancelled_exc_class()() await inner_send_chan.send(dict(data="[DONE]")) From ca11673061ecd9198b4800f68073ae14d4440ecd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 03:38:51 -0400 Subject: [PATCH 07/48] Add universal docker image --- Makefile | 7 +++++++ docker/simple/Dockerfile | 33 +++++++++++++++++++++++++++++++++ docker/simple/run.sh | 4 ++++ 3 files changed, 44 insertions(+) create mode 100644 docker/simple/Dockerfile create mode 100644 docker/simple/run.sh diff --git a/Makefile b/Makefile index 1be35cf8f..c359260b6 100644 --- a/Makefile +++ b/Makefile @@ -36,6 +36,12 @@ deploy.gh-docs: test: python3 -m pytest +docker: + docker build -t llama-cpp-python:latest -f docker/simple/Dockerfile . + +run-server: + uvicorn --factory llama.server:app --host ${HOST} --port ${PORT} + clean: - cd vendor/llama.cpp && make clean - cd vendor/llama.cpp && rm libllama.so @@ -56,4 +62,5 @@ clean: build.sdist \ deploy.pypi \ deploy.gh-docs \ + docker \ clean \ No newline at end of file diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile new file mode 100644 index 000000000..ad36b989a --- /dev/null +++ b/docker/simple/Dockerfile @@ -0,0 +1,33 @@ +# Define the image argument and provide a default value +ARG IMAGE=python:3-slim-bullseye + +# Use the image as specified +FROM ${IMAGE} + +# Re-declare the ARG after FROM +ARG IMAGE + +# Update and upgrade the existing packages +RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + ninja-build \ + build-essential + +RUN mkdir /app +WORKDIR /app +COPY . 
/app + +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette + +RUN make build && make clean + +# Set environment variable for the host +ENV HOST=0.0.0.0 +ENV PORT=8000 + +# Expose a port for the server +EXPOSE 8000 + +# Run the server start script +CMD ["/bin/sh", "/app/docker/simple/run.sh"] diff --git a/docker/simple/run.sh b/docker/simple/run.sh new file mode 100644 index 000000000..c85e73d2b --- /dev/null +++ b/docker/simple/run.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +make build +uvicorn --factory llama_cpp.server.app:create_app --host $HOST --port $PORT From d270ec231ad620beeb20da93de3b05f7a2d55cb4 Mon Sep 17 00:00:00 2001 From: Audrey Roy Greenfeld Date: Fri, 7 Jul 2023 11:15:04 +0100 Subject: [PATCH 08/48] Update macOS Metal GPU step 4 * Update "today" to version 0.1.62 * Fix numbering (there were 2 step 4's) --- docs/install/macos.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/install/macos.md b/docs/install/macos.md index 600469615..3330396e3 100644 --- a/docs/install/macos.md +++ b/docs/install/macos.md @@ -26,19 +26,19 @@ conda create -n llama python=3.9.16 conda activate llama ``` -**(4) Install the LATEST llama-cpp-python.. which, as of just today, happily supports MacOS Metal GPU** +**(4) Install the LATEST llama-cpp-python...which happily supports MacOS Metal GPU as of version 0.1.62** *(you needed xcode installed in order pip to build/compile the C++ code)* ``` pip uninstall llama-cpp-python -y CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 pip install -U llama-cpp-python --no-cache-dir pip install 'llama-cpp-python[server]' -# you should now have llama-cpp-python v0.1.62 installed -llama-cpp-python         0.1.62      +# you should now have llama-cpp-python v0.1.62 or higher installed +llama-cpp-python         0.1.68 ``` -**(4) Download a v3 ggml model** +**(5) Download a v3 ggml model** - **ggmlv3** - file name ends with **q4_0.bin** - indicating it is 4bit quantized, with quantisation method 0 From 9e61661518d78973555cb0424d371e943674cd88 Mon Sep 17 00:00:00 2001 From: wu-qing-157 Date: Fri, 7 Jul 2023 10:18:49 +0000 Subject: [PATCH 09/48] fix indexing token_logprobs after sorting --- llama_cpp/llama.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 688b2a74f..31d70b7ec 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -958,7 +958,7 @@ def _create_completion( ) ], "text_offset": [text_offset], - "token_logprobs": [sorted_logprobs[int(token)][0]], + "token_logprobs": [current_logprobs[int(token)]], "top_logprobs": [top_logprob], } returned_tokens += 1 @@ -1033,7 +1033,7 @@ def _create_completion( self.detokenize([token]).decode("utf-8", errors="ignore") ], "text_offset": [text_offset], - "token_logprobs": [sorted_logprobs[int(token)][0]], + "token_logprobs": [current_logprobs[int(token)]], "top_logprobs": [top_logprob], } @@ -1131,7 +1131,7 @@ def _create_completion( zip(logprobs_token, range(len(logprobs_token))), reverse=True ) ) - token_logprobs.append(sorted_logprobs[int(token)][0]) + token_logprobs.append(logprobs_token[int(token)]) top_logprob: Optional[Dict[str, float]] = { self.detokenize([i]).decode("utf-8", errors="ignore"): logprob for logprob, i in sorted_logprobs[:logprobs] From a14d8a9b3fdc2f967c7c8905fe7911bddb0935a0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 18:58:43 -0400 Subject: [PATCH 10/48] perf: assign to candidates data structure instead --- llama_cpp/llama.py | 6 
+++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 688b2a74f..35823cfa3 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -487,9 +487,9 @@ def _sample( nl_logit = logits[self._token_nl] candidates = self._candidates candidates_data = self._candidates_data - candidates_data["id"] = np.arange(n_vocab, dtype=np.intc) # type: ignore - candidates_data["logit"] = logits - candidates_data["p"] = np.zeros(n_vocab, dtype=np.single) + candidates_data["id"][:] = np.arange(n_vocab, dtype=np.intc) # type: ignore + candidates_data["logit"][:] = logits + candidates_data["p"][:] = np.zeros(n_vocab, dtype=np.single) candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) candidates.sorted = llama_cpp.c_bool(False) candidates.size = llama_cpp.c_size_t(n_vocab) From 7887376bffec533083f2d2170424db076089c39d Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 19:06:54 -0400 Subject: [PATCH 11/48] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 481f793ac..061f5f8d2 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 481f793acc3882a09d45d8d2c3076ad3d1c60cfc +Subproject commit 061f5f8d2109bb7adcbd40f1b456d887c5a1df25 From 11eae752110f3f69088c6a551c965f42f1507148 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 19:28:53 -0400 Subject: [PATCH 12/48] perf: avoid allocating new buffers during sampling --- llama_cpp/llama.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 35823cfa3..089518255 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -324,6 +324,8 @@ def __init__( self._candidates = candidates self._token_nl = Llama.token_nl() self._token_eos = Llama.token_eos() + self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore + self._candidates_data_p = np.zeros(self._n_vocab, dtype=np.single) self.n_tokens = 0 self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) @@ -487,9 +489,9 @@ def _sample( nl_logit = logits[self._token_nl] candidates = self._candidates candidates_data = self._candidates_data - candidates_data["id"][:] = np.arange(n_vocab, dtype=np.intc) # type: ignore + candidates_data["id"][:] = self._candidates_data_id # type: ignore candidates_data["logit"][:] = logits - candidates_data["p"][:] = np.zeros(n_vocab, dtype=np.single) + candidates_data["p"][:] = self._candidates_data_p # type: ignore candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) candidates.sorted = llama_cpp.c_bool(False) candidates.size = llama_cpp.c_size_t(n_vocab) From 52753b77f556c46057f5272b2ee547868cf53397 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 21:38:46 -0400 Subject: [PATCH 13/48] Upgrade fastapi to 0.100.0 and pydantic v2 --- .github/workflows/test.yaml | 6 +++--- docker/cuda_simple/Dockerfile | 2 +- docker/open_llama/Dockerfile | 2 +- docker/openblas_simple/Dockerfile | 2 +- docker/simple/Dockerfile | 2 +- llama_cpp/server/__main__.py | 4 ++-- llama_cpp/server/app.py | 14 ++++---------- pyproject.toml | 2 +- setup.py | 2 +- 9 files changed, 15 insertions(+), 21 deletions(-) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 56524e0db..a73e347b5 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -26,7 +26,7 @@ jobs: python-version: ${{ 
matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | @@ -49,7 +49,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | @@ -72,7 +72,7 @@ jobs: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn + python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi sse-starlette httpx uvicorn pydantic-settings pip install . -v - name: Test with pytest run: | diff --git a/docker/cuda_simple/Dockerfile b/docker/cuda_simple/Dockerfile index 24906d53a..e4a2f07e2 100644 --- a/docker/cuda_simple/Dockerfile +++ b/docker/cuda_simple/Dockerfile @@ -8,7 +8,7 @@ COPY . . # Install the package RUN apt update && apt install -y python3 python3-pip -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings RUN LLAMA_CUBLAS=1 pip install llama-cpp-python diff --git a/docker/open_llama/Dockerfile b/docker/open_llama/Dockerfile index f0ef5f721..7788f33de 100644 --- a/docker/open_llama/Dockerfile +++ b/docker/open_llama/Dockerfile @@ -14,7 +14,7 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-reco ninja-build \ build-essential -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings # Perform the conditional installations based on the image RUN echo "Image: ${IMAGE}" && \ diff --git a/docker/openblas_simple/Dockerfile b/docker/openblas_simple/Dockerfile index 1a95caeda..8231bdb96 100644 --- a/docker/openblas_simple/Dockerfile +++ b/docker/openblas_simple/Dockerfile @@ -7,7 +7,7 @@ COPY . . # Install the package RUN apt update && apt install -y libopenblas-dev ninja-build build-essential -RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile index ad36b989a..77680c811 100644 --- a/docker/simple/Dockerfile +++ b/docker/simple/Dockerfile @@ -18,7 +18,7 @@ RUN mkdir /app WORKDIR /app COPY . 
/app -RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette +RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings RUN make build && make clean diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 748a2af33..2110db31f 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -3,7 +3,7 @@ To run this example: ```bash -pip install fastapi uvicorn sse-starlette +pip install fastapi uvicorn sse-starlette pydantic-settings export MODEL=../models/7B/... ``` @@ -30,7 +30,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - for name, field in Settings.__fields__.items(): + for name, field in Settings.__model_fields__.items(): description = field.field_info.description if field.default is not None and description is not None: description += f" (default: {field.default})" diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 5d47160e9..ffd07fa6b 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -12,7 +12,8 @@ from starlette.concurrency import run_in_threadpool, iterate_in_threadpool from fastapi import Depends, FastAPI, APIRouter, Request from fastapi.middleware.cors import CORSMiddleware -from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict +from pydantic import BaseModel, Field +from pydantic_settings import BaseSettings from sse_starlette.sse import EventSourceResponse @@ -309,7 +310,6 @@ class Config: } -CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion) def make_logit_bias_processor( @@ -347,7 +347,6 @@ def logit_bias_processor( @router.post( "/v1/completions", - response_model=CreateCompletionResponse, ) async def create_completion( request: Request, @@ -416,12 +415,10 @@ class Config: } -CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding) @router.post( "/v1/embeddings", - response_model=CreateEmbeddingResponse, ) async def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) @@ -479,19 +476,17 @@ class Config: } -CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatCompletion) @router.post( "/v1/chat/completions", - response_model=CreateChatCompletionResponse, ) async def create_chat_completion( request: Request, body: CreateChatCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), settings: Settings = Depends(get_settings), -) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]: +) -> Union[llama_cpp.ChatCompletion]: # type: ignore exclude = { "n", "logit_bias", @@ -551,10 +546,9 @@ class ModelList(TypedDict): data: List[ModelData] -GetModelResponse = create_model_from_typeddict(ModelList) -@router.get("/v1/models", response_model=GetModelResponse) +@router.get("/v1/models") async def get_models( settings: Settings = Depends(get_settings), ) -> ModelList: diff --git a/pyproject.toml b/pyproject.toml index b3ad3b411..841a86869 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ httpx = "^0.24.1" scikit-build = "0.17.6" [tool.poetry.extras] -server = ["uvicorn", "fastapi", "sse-starlette"] +server = ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"] [build-system] requires = [ diff --git a/setup.py b/setup.py index 32101eb07..1d7ecbce0 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ packages=["llama_cpp", "llama_cpp.server"], 
install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ - "server": ["uvicorn>=0.21.1", "fastapi>=0.95.0", "sse-starlette>=1.3.3"], + "server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"], }, python_requires=">=3.7", classifiers=[ From 34c505edf2609acef51b47533f10cd2b8dc2f715 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 7 Jul 2023 22:54:07 -0400 Subject: [PATCH 14/48] perf: convert pointer to byref --- llama_cpp/llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 089518255..130e01390 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -537,7 +537,7 @@ def _sample( mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) llama_cpp.llama_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.pointer(candidates), + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) return llama_cpp.llama_sample_token_mirostat_v2( From ea4fbadab39548673e2a835223968b023006e539 Mon Sep 17 00:00:00 2001 From: AgentJ-WR <60302956+AgentJ-WR@users.noreply.github.com> Date: Fri, 7 Jul 2023 23:24:57 -0400 Subject: [PATCH 15/48] Show how to adjust context window in README.md --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index fb652a925..0322c73a3 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,15 @@ Below is a short example demonstrating how to use the high-level API to generate } ``` +### Adjusting the Context Window +The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements. + +For instance, if you want to work with larger contexts, you can expand the context window by setting the n_ctx parameter when initializing the Llama object: + +```python +llm = Llama(model_path="./models/7B/ggml-model.bin", n_ctx=2048) +``` + ## Web Server `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API. 
From 4f2b5d0b5321bedc879ee9b9a19ca15d18ddb995 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 00:05:10 -0400 Subject: [PATCH 16/48] Format --- llama_cpp/llama.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 130e01390..f8e05271c 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -324,7 +324,7 @@ def __init__( self._candidates = candidates self._token_nl = Llama.token_nl() self._token_eos = Llama.token_eos() - self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore + self._candidates_data_id = np.arange(self._n_vocab, dtype=np.intc) # type: ignore self._candidates_data_p = np.zeros(self._n_vocab, dtype=np.single) self.n_tokens = 0 @@ -445,8 +445,12 @@ def eval(self, tokens: Sequence[int]): # Save logits rows = n_tokens if self.params.logits_all else 1 cols = self._n_vocab - offset = 0 if self.params.logits_all else n_tokens - 1 # NOTE: Only save the last token logits if logits_all is False - self.scores[self.n_tokens + offset: self.n_tokens + n_tokens, :].reshape(-1)[:] = llama_cpp.llama_get_logits(self.ctx)[:rows * cols] + offset = ( + 0 if self.params.logits_all else n_tokens - 1 + ) # NOTE: Only save the last token logits if logits_all is False + self.scores[self.n_tokens + offset : self.n_tokens + n_tokens, :].reshape( + -1 + )[:] = llama_cpp.llama_get_logits(self.ctx)[: rows * cols] # Update n_tokens self.n_tokens += n_tokens @@ -491,7 +495,7 @@ def _sample( candidates_data = self._candidates_data candidates_data["id"][:] = self._candidates_data_id # type: ignore candidates_data["logit"][:] = logits - candidates_data["p"][:] = self._candidates_data_p # type: ignore + candidates_data["p"][:] = self._candidates_data_p # type: ignore candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p) candidates.sorted = llama_cpp.c_bool(False) candidates.size = llama_cpp.c_size_t(n_vocab) @@ -537,7 +541,7 @@ def _sample( mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value) llama_cpp.llama_sample_temperature( ctx=self.ctx, - candidates=llama_cpp.ctypes.byref(candidates), # type: ignore + candidates=llama_cpp.ctypes.byref(candidates), # type: ignore temp=temp, ) return llama_cpp.llama_sample_token_mirostat_v2( From d6e6aad927690d4bb3229be3f7980a64e46d4866 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 00:06:11 -0400 Subject: [PATCH 17/48] bugfix: fix compatibility bug with openai api on last token --- llama_cpp/llama.py | 36 ++++++++++++++++++++++++++++++++---- llama_cpp/llama_types.py | 6 ++++-- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index f8e05271c..d7d3e85e7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -1060,6 +1060,20 @@ def _create_completion( ].decode("utf-8", errors="ignore"), "index": 0, "logprobs": logprobs_or_none, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": "", + "index": 0, + "logprobs": None, "finish_reason": finish_reason, } ], @@ -1078,9 +1092,21 @@ def _create_completion( ), "index": 0, "logprobs": logprobs_or_none, - "finish_reason": finish_reason - if returned_tokens == len(completion_tokens) - else None, + "finish_reason": None, + } + ], + } + yield { + "id": completion_id, + "object": "text_completion", + "created": created, + "model": model_name, + "choices": [ + { + "text": "", + "index": 0, + 
"logprobs": None, + "finish_reason": finish_reason, } ], } @@ -1370,7 +1396,9 @@ def _convert_text_completion_chunks_to_chat( "index": 0, "delta": { "content": chunk["choices"][0]["text"], - }, + } + if chunk["choices"][0]["finish_reason"] is None + else {}, "finish_reason": chunk["choices"][0]["finish_reason"], } ], diff --git a/llama_cpp/llama_types.py b/llama_cpp/llama_types.py index 7729ced5a..6ba8023bd 100644 --- a/llama_cpp/llama_types.py +++ b/llama_cpp/llama_types.py @@ -1,4 +1,4 @@ -from typing import List, Optional, Dict +from typing import Any, List, Optional, Dict, Union from typing_extensions import TypedDict, NotRequired, Literal @@ -77,6 +77,8 @@ class ChatCompletion(TypedDict): choices: List[ChatCompletionChoice] usage: CompletionUsage +class ChatCompletionChunkDeltaEmpty(TypedDict): + pass class ChatCompletionChunkDelta(TypedDict): role: NotRequired[Literal["assistant"]] @@ -85,7 +87,7 @@ class ChatCompletionChunkDelta(TypedDict): class ChatCompletionChunkChoice(TypedDict): index: int - delta: ChatCompletionChunkDelta + delta: Union[ChatCompletionChunkDelta, ChatCompletionChunkDeltaEmpty] finish_reason: Optional[str] From 670fe4b701b2c8a8a97bac5293bb65004507a1f8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 03:37:12 -0400 Subject: [PATCH 18/48] Update changelog --- CHANGELOG.md | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11251c6df..805d7be2b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,30 +7,36 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] -## [Added] +### Added - (server) Streaming requests can are now interrupted pre-maturely when a concurrent request is made. Can be controlled with the `interrupt_requests` setting. +- (server) Moved to fastapi v0.100.0 and pydantic v2 +- (docker) Added a new "simple" image that builds llama.cpp from source when started. + +## Fixed + +- (server) performance improvements by avoiding unnecessary memory allocations during sampling ## [0.1.68] -## [Added] +### Added - (llama.cpp) Update llama.cpp ## [0.1.67] -## Fixed +### Fixed - Fix performance bug in Llama model by pre-allocating memory tokens and logits. - Fix bug in Llama model where the model was not free'd after use. ## [0.1.66] -## Added +### Added - (llama.cpp) New model API -## Fixed +### Fixed - Performance issue during eval caused by looped np.concatenate call - State pickling issue when saving cache to disk From 3a2635b9e1d591a4823f1d302e12cdc2a84a8b18 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 03:37:28 -0400 Subject: [PATCH 19/48] Update docker workflow for new simple image --- .github/workflows/build-docker.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 16b00a2f0..25669b77d 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -11,10 +11,6 @@ jobs: name: Build and push Docker image runs-on: ubuntu-latest steps: - - name: Checkout - uses: actions/checkout@v3 - with: - submodules: "true" - name: Set up QEMU uses: docker/setup-qemu-action@v2 @@ -33,6 +29,7 @@ jobs: uses: docker/build-push-action@v4 with: context: . 
+ path: ./docker/simple/Dockerfile push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 From 5b7d76608d8169e6d7696c12f0366e6ccdd6cd0c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 03:43:17 -0400 Subject: [PATCH 20/48] docker: add checkout action to dockerfile --- .github/workflows/build-docker.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 25669b77d..e0bf79ce1 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -11,6 +11,10 @@ jobs: name: Build and push Docker image runs-on: ubuntu-latest steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: "true" - name: Set up QEMU uses: docker/setup-qemu-action@v2 From 9e153fd11d1de032181619206213504c3ade1068 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 03:44:51 -0400 Subject: [PATCH 21/48] docker: update context path --- .github/workflows/build-docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index e0bf79ce1..04761e53d 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -33,7 +33,7 @@ jobs: uses: docker/build-push-action@v4 with: context: . - path: ./docker/simple/Dockerfile + path: "{context}/docker/simple/Dockerfile" push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 From 1f5e748a7e284c6376d637dd11e54be611972483 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 04:00:43 -0400 Subject: [PATCH 22/48] docker: fix docker build action args --- .github/workflows/build-docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 04761e53d..0e1738fd4 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -33,7 +33,7 @@ jobs: uses: docker/build-push-action@v4 with: context: . - path: "{context}/docker/simple/Dockerfile" + file: "{context}/docker/simple/Dockerfile" push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 From 3c85c4157391364e2a7a3ff5818c5673d5f61696 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 04:04:11 -0400 Subject: [PATCH 23/48] docker: update path to dockerfile --- .github/workflows/build-docker.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-docker.yaml b/.github/workflows/build-docker.yaml index 0e1738fd4..4b38dbacb 100644 --- a/.github/workflows/build-docker.yaml +++ b/.github/workflows/build-docker.yaml @@ -33,7 +33,7 @@ jobs: uses: docker/build-push-action@v4 with: context: . 
- file: "{context}/docker/simple/Dockerfile" + file: "docker/simple/Dockerfile" push: true # push to registry pull: true # always fetch the latest base images platforms: linux/amd64,linux/arm64 # build for both amd64 and arm64 From 00da643929d4ce81437f3b77cf29ec0a9af18b55 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 8 Jul 2023 20:30:34 -0400 Subject: [PATCH 24/48] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 061f5f8d2..64639555f 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 061f5f8d2109bb7adcbd40f1b456d887c5a1df25 +Subproject commit 64639555ff93c8ead2b80becb49cc6b60aeac240 From 99f064e6812775fa74e0bcb90f069b9025940a3e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 01:36:39 -0400 Subject: [PATCH 25/48] docker: Add libopenblas to simple image --- docker/simple/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/simple/Dockerfile b/docker/simple/Dockerfile index 77680c811..507b2ba46 100644 --- a/docker/simple/Dockerfile +++ b/docker/simple/Dockerfile @@ -12,6 +12,7 @@ RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-reco python3 \ python3-pip \ ninja-build \ + libopenblas-dev \ build-essential RUN mkdir /app From 9f21f548a5a61e96e637c6baea7596fd8cdc0b01 Mon Sep 17 00:00:00 2001 From: Shouyi Wang Date: Fri, 7 Jul 2023 19:22:10 +1000 Subject: [PATCH 26/48] Add tensor split --- llama_cpp/llama.py | 13 ++++++++++++- llama_cpp/server/app.py | 5 +++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 62e0daee3..aefb8a3af 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -19,6 +19,7 @@ from collections import deque, OrderedDict import diskcache +import ctypes from . import llama_cpp from .llama_types import * @@ -26,7 +27,6 @@ import numpy as np import numpy.typing as npt - class BaseLlamaCache(ABC): """Base cache class for a llama.cpp model.""" @@ -207,6 +207,7 @@ def __init__( n_ctx: int = 512, n_parts: int = -1, n_gpu_layers: int = 0, + tensor_split: list[float] = None, seed: int = 1337, f16_kv: bool = True, logits_all: bool = False, @@ -248,12 +249,20 @@ def __init__( Returns: A Llama instance. 
""" + if tensor_split is None: + tensor_split = [0.0] * llama_cpp.LLAMA_MAX_DEVICES.value + + #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES + FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value + c_tensor_split = FloatArray(*tensor_split) + self.verbose = verbose self.model_path = model_path self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx self.params.n_gpu_layers = n_gpu_layers + self.params.tensor_split = c_tensor_split self.params.seed = seed self.params.f16_kv = f16_kv self.params.logits_all = logits_all @@ -1494,6 +1503,7 @@ def __getstate__(self): model_path=self.model_path, n_ctx=self.params.n_ctx, n_gpu_layers=self.params.n_gpu_layers, + tensor_split=self.params.tensor_split, seed=self.params.seed, f16_kv=self.params.f16_kv, logits_all=self.params.logits_all, @@ -1518,6 +1528,7 @@ def __setstate__(self, state): n_ctx=state["n_ctx"], n_parts=state["n_parts"], n_gpu_layers=state["n_gpu_layers"], + tensor_split=state["tensor_split"], seed=state["seed"], f16_kv=state["f16_kv"], logits_all=state["logits_all"], diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ffd07fa6b..7b09d8443 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -31,6 +31,10 @@ class Settings(BaseSettings): ge=0, description="The number of layers to put on the GPU. The rest will be on the CPU.", ) + tensor_split: List[float] = Field( + default=None, + description="Split layers across multiple GPUs in proportion.", + ) seed: int = Field( default=1337, description="Random seed. -1 for random." ) @@ -117,6 +121,7 @@ def create_app(settings: Optional[Settings] = None): llama = llama_cpp.Llama( model_path=settings.model, n_gpu_layers=settings.n_gpu_layers, + tensor_split=settings.tensor_split, seed=settings.seed, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, From 9aa64163dbf57a6f36b80ba1b8399b050607b9c7 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 11:40:59 -0400 Subject: [PATCH 27/48] Update llama.cpp --- vendor/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 64639555f..1d1630996 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 64639555ff93c8ead2b80becb49cc6b60aeac240 +Subproject commit 1d1630996920f889cdc08de26cebf2415958540e From 0f3c474a49af412117449b19a2844f84c23205ca Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 11:44:29 -0400 Subject: [PATCH 28/48] Bump version --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 805d7be2b..0e181d691 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.69] + ### Added - (server) Streaming requests can are now interrupted pre-maturely when a concurrent request is made. Can be controlled with the `interrupt_requests` setting. 
diff --git a/pyproject.toml b/pyproject.toml index 841a86869..fb1962936 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.68" +version = "0.1.69" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 1d7ecbce0..baaabcc47 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.68", + version="0.1.69", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 6f70cc4b7dd950a95708ed7e7da9ac550e87a76c Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 18:03:31 -0400 Subject: [PATCH 29/48] bugfix: pydantic settings missing / changed fields --- llama_cpp/server/__main__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llama_cpp/server/__main__.py b/llama_cpp/server/__main__.py index 2110db31f..995dd4449 100644 --- a/llama_cpp/server/__main__.py +++ b/llama_cpp/server/__main__.py @@ -30,14 +30,14 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - for name, field in Settings.__model_fields__.items(): - description = field.field_info.description + for name, field in Settings.model_fields.items(): + description = field.description if field.default is not None and description is not None: description += f" (default: {field.default})" parser.add_argument( f"--{name}", dest=name, - type=field.type_, + type=field.annotation if field.annotation is not None else str, help=description, ) From a86bfdf0a50f23a6aebb3f095ada0afcf8791d6e Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 18:13:29 -0400 Subject: [PATCH 30/48] bugfix: truncate completion max_tokens to fit context length by default --- llama_cpp/llama.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 62e0daee3..edb68c9e5 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -824,19 +824,15 @@ def _create_completion( if self.verbose: llama_cpp.llama_reset_timings(self.ctx) - if max_tokens <= 0: - # Unlimited, depending on n_ctx. - if len(prompt_tokens) >= int(llama_cpp.llama_n_ctx(self.ctx)): - raise ValueError( - f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" - ) - else: - max_tokens = int(llama_cpp.llama_n_ctx(self.ctx)) - len(prompt_tokens) - elif len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)): + if len(prompt_tokens) >= llama_cpp.llama_n_ctx(self.ctx): raise ValueError( - f"Requested tokens ({len(prompt_tokens)}) exceed context window of {self._n_ctx}" + f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}" ) + if max_tokens <= 0: + # Unlimited, depending on n_ctx. 
+ max_tokens = llama_cpp.llama_n_ctx(self.ctx) - len(prompt_tokens) + # Truncate max_tokens if requested tokens would exceed the context window max_tokens = ( max_tokens From df3d54593868fbe5e8e488cd0c7a638971fbd3b8 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 18:13:41 -0400 Subject: [PATCH 31/48] Update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0e181d691..40974132b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed + +- (Llama.create_completion) Revert change so that `max_tokens` is not truncated to `context_size` in `create_completion` +- (server) Fixed changed settings field names from pydantic v2 migration + ## [0.1.69] ### Added From c988c2ac0b7611e4fe8001a28002767f37e09675 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 18:19:37 -0400 Subject: [PATCH 32/48] Bump version --- pyproject.toml | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index fb1962936..a9e012e6b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.69" +version = "0.1.70" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index baaabcc47..b8acedb5a 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.69", + version="0.1.70", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 8e0f6253db0e8aa30bcc90fc26d49d221d003070 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sun, 9 Jul 2023 18:20:04 -0400 Subject: [PATCH 33/48] Bump version --- CHANGELOG.md | 2 ++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40974132b..8b5db37b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.70] + ### Fixed - (Llama.create_completion) Revert change so that `max_tokens` is not truncated to `context_size` in `create_completion` diff --git a/pyproject.toml b/pyproject.toml index fb1962936..a9e012e6b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.69" +version = "0.1.70" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index baaabcc47..b8acedb5a 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.69", + version="0.1.70", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From 3f8f276f9f79ec4394ba1b73f4d5f0afb11e2d96 Mon Sep 17 00:00:00 2001 From: randoentity <137087500+randoentity@users.noreply.github.com> Date: Sun, 9 Jul 2023 09:05:16 +0200 Subject: [PATCH 34/48] Add bindings for custom_rope --- llama_cpp/llama.py | 6 ++++++ llama_cpp/llama_cpp.py | 2 ++ vendor/llama.cpp | 2 +- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index edb68c9e5..ada6d695b 
100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -205,6 +205,8 @@ def __init__( model_path: str, # NOTE: These parameters are likely to change in the future. n_ctx: int = 512, + rope_freq_base: float = 10000.0, + rope_freq_scale: float = 1.0, n_parts: int = -1, n_gpu_layers: int = 0, seed: int = 1337, @@ -227,6 +229,8 @@ def __init__( Args: model_path: Path to the model. n_ctx: Maximum context size. + rope_freq_base: RoPE base frequency. + rope_freq_scale: RoPE frequency scale. n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. seed: Random seed. -1 for random. f16_kv: Use half-precision for key/value cache. @@ -253,6 +257,8 @@ def __init__( self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx + self.params.rope_freq_base = rope_freq_base + self.params.rope_freq_scale = rope_freq_scale self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed self.params.f16_kv = f16_kv diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 17c631961..320c48b9c 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -184,6 +184,8 @@ class llama_context_params(Structure): _fields_ = [ ("seed", c_uint32), ("n_ctx", c_int32), + ("rope_freq_base", c_float), + ("rope_freq_scale", c_float), ("n_batch", c_int32), ("n_gpu_layers", c_int32), ("main_gpu", c_int32), diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1d1630996..a3b4d9328 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1d1630996920f889cdc08de26cebf2415958540e +Subproject commit a3b4d932859f4e51ed716bfa1f07e2d2eede2c23 From 7bb0024cd0c12d0d36207172410f13e1d343eeac Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Wed, 12 Jul 2023 19:31:43 -0400 Subject: [PATCH 35/48] Fix uvicorn dependency --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b8acedb5a..ab5d825d4 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ packages=["llama_cpp", "llama_cpp.server"], install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ - "server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"], + "server": ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"], }, python_requires=">=3.7", classifiers=[ From 896ab7b88a45768dcb0e6038ed6ec8cbdd88a634 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 13 Jul 2023 23:24:55 -0400 Subject: [PATCH 36/48] Update llama.cpp --- llama_cpp/llama_cpp.py | 55 +++++++++++++++++++++++++++++++++++++----- vendor/llama.cpp | 2 +- 2 files changed, 50 insertions(+), 7 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 17c631961..b5bab56a3 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -326,13 +326,23 @@ def llama_mlock_supported() -> bool: # // Initialize the llama + ggml backend # // If numa is true, use NUMA optimizations # // Call once at the start of the program -# LLAMA_API void llama_init_backend(bool numa); -def llama_init_backend(numa: c_bool): - return _lib.llama_init_backend(numa) +# LLAMA_API void llama_backend_init(bool numa); +def llama_backend_init(numa: c_bool): + return _lib.llama_backend_init(numa) -_lib.llama_init_backend.argtypes = [c_bool] -_lib.llama_init_backend.restype = None +_lib.llama_backend_init.argtypes = [c_bool] +_lib.llama_backend_init.restype = None + + +# // Call once at the end of the program - currently only used for 
MPI +# LLAMA_API void llama_backend_free(); +def llama_backend_free(): + return _lib.llama_backend_free() + + +_lib.llama_backend_free.argtypes = [] +_lib.llama_backend_free.restype = None # LLAMA_API struct llama_model * llama_load_model_from_file( @@ -819,6 +829,39 @@ def llama_sample_frequency_and_presence_penalties( _lib.llama_sample_frequency_and_presence_penalties.restype = None +# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 +# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. +# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. +# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. +# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits. +# LLAMA_API void llama_sample_classifier_free_guidance( +# struct llama_context * ctx, +# llama_token_data_array * candidates, +# struct llama_context * guidance_ctx, +# float scale, +# float smooth_factor); +def llama_sample_classifier_free_guidance( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + guidance_ctx: llama_context_p, + scale: c_float, + smooth_factor: c_float, +): + return _lib.llama_sample_classifier_free_guidance( + ctx, candidates, guidance_ctx, scale, smooth_factor + ) + + +_lib.llama_sample_classifier_free_guidance.argtypes = [ + llama_context_p, + llama_token_data_array_p, + llama_context_p, + c_float, + c_float, +] +_lib.llama_sample_classifier_free_guidance.restype = None + + # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. # LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_softmax( @@ -1063,5 +1106,5 @@ def llama_print_system_info() -> bytes: _llama_initialized = False if not _llama_initialized: - llama_init_backend(c_bool(False)) + llama_backend_init(c_bool(False)) _llama_initialized = True diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 1d1630996..32c541163 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 1d1630996920f889cdc08de26cebf2415958540e +Subproject commit 32c54116318929c90fd7ae814cf9b5232cd44c36 From de4cc5a233952e0dede642702f3170cd1bae5869 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 13 Jul 2023 23:25:12 -0400 Subject: [PATCH 37/48] bugfix: pydantic v2 fields --- llama_cpp/server/app.py | 108 +++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 58 deletions(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ffd07fa6b..202a06ddc 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -31,9 +31,7 @@ class Settings(BaseSettings): ge=0, description="The number of layers to put on the GPU. The rest will be on the CPU.", ) - seed: int = Field( - default=1337, description="Random seed. -1 for random." - ) + seed: int = Field(default=1337, description="Random seed. -1 for random.") n_batch: int = Field( default=512, ge=1, description="The batch size to use per eval." 
) @@ -80,12 +78,8 @@ class Settings(BaseSettings): verbose: bool = Field( default=True, description="Whether to print debug information." ) - host: str = Field( - default="localhost", description="Listen address" - ) - port: int = Field( - default=8000, description="Listen port" - ) + host: str = Field(default="localhost", description="Listen address") + port: int = Field(default=8000, description="Listen port") interrupt_requests: bool = Field( default=True, description="Whether to interrupt requests when a new request is received.", @@ -178,7 +172,7 @@ def get_settings(): yield settings -model_field = Field(description="The model to use for generating completions.") +model_field = Field(description="The model to use for generating completions.", default=None) max_tokens_field = Field( default=16, ge=1, le=2048, description="The maximum number of tokens to generate." @@ -242,21 +236,18 @@ def get_settings(): default=0, ge=0, le=2, - description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)" + description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)", ) mirostat_tau_field = Field( default=5.0, ge=0.0, le=10.0, - description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text" + description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text", ) mirostat_eta_field = Field( - default=0.1, - ge=0.001, - le=1.0, - description="Mirostat learning rate" + default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate" ) @@ -294,22 +285,23 @@ class CreateCompletionRequest(BaseModel): model: Optional[str] = model_field n: Optional[int] = 1 best_of: Optional[int] = 1 - user: Optional[str] = Field(None) + user: Optional[str] = Field(default=None) # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - class Config: - schema_extra = { - "example": { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } + model_config = { + "json_schema_extra": { + "examples": [ + { + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": ["\n", "###"], + } + ] } - - + } def make_logit_bias_processor( @@ -328,7 +320,7 @@ def make_logit_bias_processor( elif logit_bias_type == "tokens": for token, score in logit_bias.items(): - token = token.encode('utf-8') + token = token.encode("utf-8") for input_id in llama.tokenize(token, add_bos=False): to_bias[input_id] = score @@ -352,7 +344,7 @@ async def create_completion( request: Request, body: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), -): +) -> llama_cpp.Completion: if isinstance(body.prompt, list): assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" @@ -364,7 +356,7 @@ async def create_completion( "logit_bias_type", "user", } - kwargs = body.dict(exclude=exclude) + kwargs = body.model_dump(exclude=exclude) if body.logit_bias is not None: kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ @@ -396,7 +388,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): return EventSourceResponse( recv_chan, data_sender_callable=partial(event_publisher, 
send_chan) - ) + ) # type: ignore else: completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs) # type: ignore return completion @@ -405,16 +397,17 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): class CreateEmbeddingRequest(BaseModel): model: Optional[str] = model_field input: Union[str, List[str]] = Field(description="The input to embed.") - user: Optional[str] - - class Config: - schema_extra = { - "example": { - "input": "The food was delicious and the waiter...", - } + user: Optional[str] = Field(default=None) + + model_config = { + "json_schema_extra": { + "examples": [ + { + "input": "The food was delicious and the waiter...", + } + ] } - - + } @router.post( @@ -424,7 +417,7 @@ async def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): return await run_in_threadpool( - llama.create_embedding, **request.dict(exclude={"user"}) + llama.create_embedding, **request.model_dump(exclude={"user"}) ) @@ -461,21 +454,22 @@ class CreateChatCompletionRequest(BaseModel): repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - class Config: - schema_extra = { - "example": { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" - ), - ] - } + model_config = { + "json_schema_extra": { + "examples": [ + { + "messages": [ + ChatCompletionRequestMessage( + role="system", content="You are a helpful assistant." + ).model_dump(), + ChatCompletionRequestMessage( + role="user", content="What is the capital of France?" + ).model_dump(), + ] + } + ] } - - + } @router.post( @@ -486,14 +480,14 @@ async def create_chat_completion( body: CreateChatCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), settings: Settings = Depends(get_settings), -) -> Union[llama_cpp.ChatCompletion]: # type: ignore +) -> llama_cpp.ChatCompletion: exclude = { "n", "logit_bias", "logit_bias_type", "user", } - kwargs = body.dict(exclude=exclude) + kwargs = body.model_dump(exclude=exclude) if body.logit_bias is not None: kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ @@ -526,7 +520,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): return EventSourceResponse( recv_chan, data_sender_callable=partial(event_publisher, send_chan), - ) + ) # type: ignore else: completion: llama_cpp.ChatCompletion = await run_in_threadpool( llama.create_chat_completion, **kwargs # type: ignore @@ -546,8 +540,6 @@ class ModelList(TypedDict): data: List[ModelData] - - @router.get("/v1/models") async def get_models( settings: Settings = Depends(get_settings), From 6705f9b6c6b3369481c4e2e0e15d0f1af7a96eff Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Thu, 13 Jul 2023 23:32:06 -0400 Subject: [PATCH 38/48] Bump version --- CHANGELOG.md | 10 ++++++++++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b5db37b0..47b55a73d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.71] + +### Added + +- (llama.cpp) Update llama.cpp + +### Fixed + +- (server) Fix several pydantic v2 migration bugs + ## [0.1.70] ### Fixed diff --git a/pyproject.toml b/pyproject.toml index a9e012e6b..1cff2318c 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.70" +version = "0.1.71" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index ab5d825d4..71af72c44 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.70", + version="0.1.71", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From e6c67c8f7d0e6cb85e27a4efb53569a6c304a344 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Jul 2023 16:40:31 -0400 Subject: [PATCH 39/48] Update llama.cpp --- llama_cpp/llama_cpp.py | 74 ++++++++++++++++++++++++++++++++++++++++-- vendor/llama.cpp | 2 +- 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index b5bab56a3..04de04663 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -656,6 +656,22 @@ def llama_tokenize( _lib.llama_tokenize.restype = c_int +# LLAMA_API int llama_tokenize_with_model( +# const struct llama_model * model, +# const char * text, +# llama_token * tokens, +# int n_max_tokens, +# bool add_bos); +def llama_tokenize_with_model( + model: llama_model_p, + text: bytes, + tokens, # type: Array[llama_token] + n_max_tokens: c_int, + add_bos: c_bool, +) -> int: + return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos) + + # LLAMA_API int llama_n_vocab(const struct llama_context * ctx); def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -683,6 +699,33 @@ def llama_n_embd(ctx: llama_context_p) -> int: _lib.llama_n_embd.restype = c_int +# LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); +def llama_n_vocab_from_model(model: llama_model_p) -> int: + return _lib.llama_n_vocab_from_model(model) + + +_lib.llama_n_vocab_from_model.argtypes = [llama_model_p] +_lib.llama_n_vocab_from_model.restype = c_int + + +# LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); +def llama_n_ctx_from_model(model: llama_model_p) -> int: + return _lib.llama_n_ctx_from_model(model) + + +_lib.llama_n_ctx_from_model.argtypes = [llama_model_p] +_lib.llama_n_ctx_from_model.restype = c_int + + +# LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); +def llama_n_embd_from_model(model: llama_model_p) -> int: + return _lib.llama_n_embd_from_model(model) + + +_lib.llama_n_embd_from_model.argtypes = [llama_model_p] +_lib.llama_n_embd_from_model.restype = c_int + + # // Get the vocabulary as output parameters. # // Returns number of results. 
# LLAMA_API int llama_get_vocab( @@ -703,6 +746,20 @@ def llama_get_vocab( _lib.llama_get_vocab.restype = c_int +# LLAMA_API int llama_get_vocab_from_model( +# const struct llama_model * model, +# const char * * strings, +# float * scores, +# int capacity); +def llama_get_vocab_from_model( + model: llama_model_p, + strings, # type: Array[c_char_p] # type: ignore + scores, # type: Array[c_float] # type: ignore + capacity: c_int, +) -> int: + return _lib.llama_get_vocab_from_model(model, strings, scores, capacity) + + # Token logits obtained from the last call to llama_eval() # The logits for the last token are stored in the last row # Can be mutated in order to change the probabilities of the next token @@ -732,8 +789,10 @@ def llama_get_embeddings( _lib.llama_get_embeddings.restype = c_float_p -# Token Id -> String. Uses the vocabulary in the provided context -# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); +# // Token Id -> String. Uses the vocabulary in the provided context +# LLAMA_API const char * llama_token_to_str( +# const struct llama_context * ctx, +# llama_token token); def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: return _lib.llama_token_to_str(ctx, token) @@ -741,6 +800,17 @@ def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: _lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] _lib.llama_token_to_str.restype = c_char_p + +# LLAMA_API const char * llama_token_to_str_with_model( +# const struct llama_model * model, +# llama_token token); +def llama_token_to_str_with_model(model: llama_model_p, token: llama_token) -> bytes: + return _lib.llama_token_to_str_with_model(model, token) + + +_lib.llama_token_to_str_with_model.argtypes = [llama_model_p, llama_token] +_lib.llama_token_to_str_with_model.restype = c_char_p + # Special tokens diff --git a/vendor/llama.cpp b/vendor/llama.cpp index 32c541163..a6803cab9 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit 32c54116318929c90fd7ae814cf9b5232cd44c36 +Subproject commit a6803cab946c817fb7aaf2a40b317f5d3e373bd1 From 25b3494e11cc0a51bbfacc86353eabd9f1d6a147 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Jul 2023 16:40:53 -0400 Subject: [PATCH 40/48] Minor fix to tensor_split parameter --- llama_cpp/llama.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index c5869edfd..849e7752e 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -207,7 +207,6 @@ def __init__( n_ctx: int = 512, n_parts: int = -1, n_gpu_layers: int = 0, - tensor_split: list[float] = None, seed: int = 1337, f16_kv: bool = True, logits_all: bool = False, @@ -221,6 +220,7 @@ def __init__( lora_base: Optional[str] = None, lora_path: Optional[str] = None, low_vram: bool = False, + tensor_split: Optional[List[float]] = None, verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -241,6 +241,7 @@ def __init__( last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. + tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split. verbose: Print verbose output to stderr. Raises: @@ -249,12 +250,6 @@ def __init__( Returns: A Llama instance. 
""" - if tensor_split is None: - tensor_split = [0.0] * llama_cpp.LLAMA_MAX_DEVICES.value - - #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES - FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value - c_tensor_split = FloatArray(*tensor_split) self.verbose = verbose self.model_path = model_path @@ -262,7 +257,6 @@ def __init__( self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx self.params.n_gpu_layers = n_gpu_layers - self.params.tensor_split = c_tensor_split self.params.seed = seed self.params.f16_kv = f16_kv self.params.logits_all = logits_all @@ -272,6 +266,15 @@ def __init__( self.params.embedding = embedding self.params.low_vram = low_vram + self.tensor_split = tensor_split + self._c_tensor_split = None + + if self.tensor_split is not None: + #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES + FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value + self._c_tensor_split = FloatArray(*tensor_split) # keep a reference to the array so it is not gc'd + self.params.tensor_split = self._c_tensor_split + self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) @@ -1499,7 +1502,6 @@ def __getstate__(self): model_path=self.model_path, n_ctx=self.params.n_ctx, n_gpu_layers=self.params.n_gpu_layers, - tensor_split=self.params.tensor_split, seed=self.params.seed, f16_kv=self.params.f16_kv, logits_all=self.params.logits_all, @@ -1513,6 +1515,7 @@ def __getstate__(self): n_threads=self.n_threads, lora_base=self.lora_base, lora_path=self.lora_path, + tensor_split=self.tensor_split, ### DEPRECATED ### n_parts=self.n_parts, ### DEPRECATED ### @@ -1524,7 +1527,6 @@ def __setstate__(self, state): n_ctx=state["n_ctx"], n_parts=state["n_parts"], n_gpu_layers=state["n_gpu_layers"], - tensor_split=state["tensor_split"], seed=state["seed"], f16_kv=state["f16_kv"], logits_all=state["logits_all"], @@ -1538,6 +1540,7 @@ def __setstate__(self, state): last_n_tokens_size=state["last_n_tokens_size"], lora_base=state["lora_base"], lora_path=state["lora_path"], + tensor_split=state["tensor_split"], verbose=state["verbose"], ) From 118b7f6d5c2cacab8d2c4a4c2d44b6a4eda03b37 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Fri, 14 Jul 2023 16:52:48 -0400 Subject: [PATCH 41/48] fix: tensor_split should be optional list --- llama_cpp/server/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index 8dc5a0f03..eaa6f44a9 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -31,7 +31,7 @@ class Settings(BaseSettings): ge=0, description="The number of layers to put on the GPU. 
The rest will be on the CPU.", ) - tensor_split: List[float] = Field( + tensor_split: Optional[List[float]] = Field( default=None, description="Split layers across multiple GPUs in proportion.", ) From f72b6e9b732396ac29fb44e5cc43d4743d1b6fd9 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Jul 2023 15:01:08 -0400 Subject: [PATCH 42/48] Update llama.cpp --- llama_cpp/llama_cpp.py | 8 +++++++- vendor/llama.cpp | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 04de04663..aef4f65c7 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -165,12 +165,16 @@ class llama_token_data_array(Structure): # int32_t n_gpu_layers; // number of layers to store in VRAM # int32_t main_gpu; // the GPU that is used for scratch and small tensors # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs + +# // ref: https://github.com/ggerganov/llama.cpp/pull/2054 +# float rope_freq_base; // RoPE base frequency +# float rope_freq_scale; // RoPE frequency scaling factor + # // called with a progress value between 0 and 1, pass NULL to disable # llama_progress_callback progress_callback; # // context pointer passed to the progress callback # void * progress_callback_user_data; - # // Keep the booleans together to avoid misalignment during copy-by-value. # bool low_vram; // if true, reduce VRAM usage at the cost of performance # bool f16_kv; // use fp16 for KV cache @@ -188,6 +192,8 @@ class llama_context_params(Structure): ("n_gpu_layers", c_int32), ("main_gpu", c_int32), ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), + ("rope_freq_base", c_float), + ("rope_freq_scale", c_float), ("progress_callback", llama_progress_callback), ("progress_callback_user_data", c_void_p), ("low_vram", c_bool), diff --git a/vendor/llama.cpp b/vendor/llama.cpp index a6803cab9..6e7cca404 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit a6803cab946c817fb7aaf2a40b317f5d3e373bd1 +Subproject commit 6e7cca404748dd4b1a3affd0d1296e37f4ac0a6f From f0797a6054d97530663f5831ef498f45ceeda113 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Jul 2023 15:11:01 -0400 Subject: [PATCH 43/48] Merge branch main into custom_rope --- CHANGELOG.md | 10 +++ README.md | 1 + llama_cpp/llama.py | 16 ++++- llama_cpp/llama_cpp.py | 137 +++++++++++++++++++++++++++++++++++++--- llama_cpp/server/app.py | 109 ++++++++++++++++---------------- pyproject.toml | 2 +- setup.py | 4 +- vendor/llama.cpp | 2 +- 8 files changed, 212 insertions(+), 69 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8b5db37b0..47b55a73d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.71] + +### Added + +- (llama.cpp) Update llama.cpp + +### Fixed + +- (server) Fix several pydantic v2 migration bugs + ## [0.1.70] ### Fixed diff --git a/README.md b/README.md index 0322c73a3..1f3dcb5ab 100644 --- a/README.md +++ b/README.md @@ -135,6 +135,7 @@ A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). 
```bash docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest ``` +[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389) ## Low-level API diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index ada6d695b..7bda0461b 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -19,6 +19,7 @@ from collections import deque, OrderedDict import diskcache +import ctypes from . import llama_cpp from .llama_types import * @@ -26,7 +27,6 @@ import numpy as np import numpy.typing as npt - class BaseLlamaCache(ABC): """Base cache class for a llama.cpp model.""" @@ -222,6 +222,7 @@ def __init__( lora_base: Optional[str] = None, lora_path: Optional[str] = None, low_vram: bool = False, + tensor_split: Optional[List[float]] = None, verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -244,6 +245,7 @@ def __init__( last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque. lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. + tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split. verbose: Print verbose output to stderr. Raises: @@ -252,6 +254,7 @@ def __init__( Returns: A Llama instance. """ + self.verbose = verbose self.model_path = model_path @@ -269,6 +272,15 @@ def __init__( self.params.embedding = embedding self.params.low_vram = low_vram + self.tensor_split = tensor_split + self._c_tensor_split = None + + if self.tensor_split is not None: + #Type conversion and expand the list to the length of LLAMA_MAX_DEVICES + FloatArray = ctypes.c_float * llama_cpp.LLAMA_MAX_DEVICES.value + self._c_tensor_split = FloatArray(*tensor_split) # keep a reference to the array so it is not gc'd + self.params.tensor_split = self._c_tensor_split + self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) @@ -1509,6 +1521,7 @@ def __getstate__(self): n_threads=self.n_threads, lora_base=self.lora_base, lora_path=self.lora_path, + tensor_split=self.tensor_split, ### DEPRECATED ### n_parts=self.n_parts, ### DEPRECATED ### @@ -1533,6 +1546,7 @@ def __setstate__(self, state): last_n_tokens_size=state["last_n_tokens_size"], lora_base=state["lora_base"], lora_path=state["lora_path"], + tensor_split=state["tensor_split"], verbose=state["verbose"], ) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 320c48b9c..32f70f005 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -165,12 +165,16 @@ class llama_token_data_array(Structure): # int32_t n_gpu_layers; // number of layers to store in VRAM # int32_t main_gpu; // the GPU that is used for scratch and small tensors # float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs + +# // ref: https://github.com/ggerganov/llama.cpp/pull/2054 +# float rope_freq_base; // RoPE base frequency +# float rope_freq_scale; // RoPE frequency scaling factor + # // called with a progress value between 0 and 1, pass NULL to disable # llama_progress_callback progress_callback; # // context pointer passed to the progress callback # void * progress_callback_user_data; - # // Keep the booleans together to avoid misalignment during copy-by-value. 
# bool low_vram; // if true, reduce VRAM usage at the cost of performance # bool f16_kv; // use fp16 for KV cache @@ -190,6 +194,8 @@ class llama_context_params(Structure): ("n_gpu_layers", c_int32), ("main_gpu", c_int32), ("tensor_split", c_float * LLAMA_MAX_DEVICES.value), + ("rope_freq_base", c_float), + ("rope_freq_scale", c_float), ("progress_callback", llama_progress_callback), ("progress_callback_user_data", c_void_p), ("low_vram", c_bool), @@ -328,13 +334,23 @@ def llama_mlock_supported() -> bool: # // Initialize the llama + ggml backend # // If numa is true, use NUMA optimizations # // Call once at the start of the program -# LLAMA_API void llama_init_backend(bool numa); -def llama_init_backend(numa: c_bool): - return _lib.llama_init_backend(numa) +# LLAMA_API void llama_backend_init(bool numa); +def llama_backend_init(numa: c_bool): + return _lib.llama_backend_init(numa) + + +_lib.llama_backend_init.argtypes = [c_bool] +_lib.llama_backend_init.restype = None + +# // Call once at the end of the program - currently only used for MPI +# LLAMA_API void llama_backend_free(); +def llama_backend_free(): + return _lib.llama_backend_free() -_lib.llama_init_backend.argtypes = [c_bool] -_lib.llama_init_backend.restype = None + +_lib.llama_backend_free.argtypes = [] +_lib.llama_backend_free.restype = None # LLAMA_API struct llama_model * llama_load_model_from_file( @@ -648,6 +664,22 @@ def llama_tokenize( _lib.llama_tokenize.restype = c_int +# LLAMA_API int llama_tokenize_with_model( +# const struct llama_model * model, +# const char * text, +# llama_token * tokens, +# int n_max_tokens, +# bool add_bos); +def llama_tokenize_with_model( + model: llama_model_p, + text: bytes, + tokens, # type: Array[llama_token] + n_max_tokens: c_int, + add_bos: c_bool, +) -> int: + return _lib.llama_tokenize_with_model(model, text, tokens, n_max_tokens, add_bos) + + # LLAMA_API int llama_n_vocab(const struct llama_context * ctx); def llama_n_vocab(ctx: llama_context_p) -> int: return _lib.llama_n_vocab(ctx) @@ -675,6 +707,33 @@ def llama_n_embd(ctx: llama_context_p) -> int: _lib.llama_n_embd.restype = c_int +# LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model); +def llama_n_vocab_from_model(model: llama_model_p) -> int: + return _lib.llama_n_vocab_from_model(model) + + +_lib.llama_n_vocab_from_model.argtypes = [llama_model_p] +_lib.llama_n_vocab_from_model.restype = c_int + + +# LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model); +def llama_n_ctx_from_model(model: llama_model_p) -> int: + return _lib.llama_n_ctx_from_model(model) + + +_lib.llama_n_ctx_from_model.argtypes = [llama_model_p] +_lib.llama_n_ctx_from_model.restype = c_int + + +# LLAMA_API int llama_n_embd_from_model (const struct llama_model * model); +def llama_n_embd_from_model(model: llama_model_p) -> int: + return _lib.llama_n_embd_from_model(model) + + +_lib.llama_n_embd_from_model.argtypes = [llama_model_p] +_lib.llama_n_embd_from_model.restype = c_int + + # // Get the vocabulary as output parameters. # // Returns number of results. 
# LLAMA_API int llama_get_vocab( @@ -695,6 +754,20 @@ def llama_get_vocab( _lib.llama_get_vocab.restype = c_int +# LLAMA_API int llama_get_vocab_from_model( +# const struct llama_model * model, +# const char * * strings, +# float * scores, +# int capacity); +def llama_get_vocab_from_model( + model: llama_model_p, + strings, # type: Array[c_char_p] # type: ignore + scores, # type: Array[c_float] # type: ignore + capacity: c_int, +) -> int: + return _lib.llama_get_vocab_from_model(model, strings, scores, capacity) + + # Token logits obtained from the last call to llama_eval() # The logits for the last token are stored in the last row # Can be mutated in order to change the probabilities of the next token @@ -724,8 +797,10 @@ def llama_get_embeddings( _lib.llama_get_embeddings.restype = c_float_p -# Token Id -> String. Uses the vocabulary in the provided context -# LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token); +# // Token Id -> String. Uses the vocabulary in the provided context +# LLAMA_API const char * llama_token_to_str( +# const struct llama_context * ctx, +# llama_token token); def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: return _lib.llama_token_to_str(ctx, token) @@ -733,6 +808,17 @@ def llama_token_to_str(ctx: llama_context_p, token: llama_token) -> bytes: _lib.llama_token_to_str.argtypes = [llama_context_p, llama_token] _lib.llama_token_to_str.restype = c_char_p + +# LLAMA_API const char * llama_token_to_str_with_model( +# const struct llama_model * model, +# llama_token token); +def llama_token_to_str_with_model(model: llama_model_p, token: llama_token) -> bytes: + return _lib.llama_token_to_str_with_model(model, token) + + +_lib.llama_token_to_str_with_model.argtypes = [llama_model_p, llama_token] +_lib.llama_token_to_str_with_model.restype = c_char_p + # Special tokens @@ -821,6 +907,39 @@ def llama_sample_frequency_and_presence_penalties( _lib.llama_sample_frequency_and_presence_penalties.restype = None +# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 +# /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted. +# /// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. +# /// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. +# /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits. 
+# LLAMA_API void llama_sample_classifier_free_guidance( +# struct llama_context * ctx, +# llama_token_data_array * candidates, +# struct llama_context * guidance_ctx, +# float scale, +# float smooth_factor); +def llama_sample_classifier_free_guidance( + ctx: llama_context_p, + candidates, # type: _Pointer[llama_token_data_array] + guidance_ctx: llama_context_p, + scale: c_float, + smooth_factor: c_float, +): + return _lib.llama_sample_classifier_free_guidance( + ctx, candidates, guidance_ctx, scale, smooth_factor + ) + + +_lib.llama_sample_classifier_free_guidance.argtypes = [ + llama_context_p, + llama_token_data_array_p, + llama_context_p, + c_float, + c_float, +] +_lib.llama_sample_classifier_free_guidance.restype = None + + # @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. # LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates); def llama_sample_softmax( @@ -1065,5 +1184,5 @@ def llama_print_system_info() -> bytes: _llama_initialized = False if not _llama_initialized: - llama_init_backend(c_bool(False)) + llama_backend_init(c_bool(False)) _llama_initialized = True diff --git a/llama_cpp/server/app.py b/llama_cpp/server/app.py index ffd07fa6b..eaa6f44a9 100644 --- a/llama_cpp/server/app.py +++ b/llama_cpp/server/app.py @@ -31,6 +31,10 @@ class Settings(BaseSettings): ge=0, description="The number of layers to put on the GPU. The rest will be on the CPU.", ) + tensor_split: Optional[List[float]] = Field( + default=None, + description="Split layers across multiple GPUs in proportion.", + ) seed: int = Field( default=1337, description="Random seed. -1 for random." ) @@ -80,12 +84,8 @@ class Settings(BaseSettings): verbose: bool = Field( default=True, description="Whether to print debug information." ) - host: str = Field( - default="localhost", description="Listen address" - ) - port: int = Field( - default=8000, description="Listen port" - ) + host: str = Field(default="localhost", description="Listen address") + port: int = Field(default=8000, description="Listen port") interrupt_requests: bool = Field( default=True, description="Whether to interrupt requests when a new request is received.", @@ -117,6 +117,7 @@ def create_app(settings: Optional[Settings] = None): llama = llama_cpp.Llama( model_path=settings.model, n_gpu_layers=settings.n_gpu_layers, + tensor_split=settings.tensor_split, seed=settings.seed, f16_kv=settings.f16_kv, use_mlock=settings.use_mlock, @@ -178,7 +179,7 @@ def get_settings(): yield settings -model_field = Field(description="The model to use for generating completions.") +model_field = Field(description="The model to use for generating completions.", default=None) max_tokens_field = Field( default=16, ge=1, le=2048, description="The maximum number of tokens to generate." @@ -242,21 +243,18 @@ def get_settings(): default=0, ge=0, le=2, - description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)" + description="Enable Mirostat constant-perplexity algorithm of the specified version (1 or 2; 0 = disabled)", ) mirostat_tau_field = Field( default=5.0, ge=0.0, le=10.0, - description="Mirostat target entropy, i.e. the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text" + description="Mirostat target entropy, i.e. 
the target perplexity - lower values produce focused and coherent text, larger values produce more diverse and less coherent text", ) mirostat_eta_field = Field( - default=0.1, - ge=0.001, - le=1.0, - description="Mirostat learning rate" + default=0.1, ge=0.001, le=1.0, description="Mirostat learning rate" ) @@ -294,22 +292,23 @@ class CreateCompletionRequest(BaseModel): model: Optional[str] = model_field n: Optional[int] = 1 best_of: Optional[int] = 1 - user: Optional[str] = Field(None) + user: Optional[str] = Field(default=None) # llama.cpp specific parameters top_k: int = top_k_field repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - class Config: - schema_extra = { - "example": { - "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", - "stop": ["\n", "###"], - } + model_config = { + "json_schema_extra": { + "examples": [ + { + "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n", + "stop": ["\n", "###"], + } + ] } - - + } def make_logit_bias_processor( @@ -328,7 +327,7 @@ def make_logit_bias_processor( elif logit_bias_type == "tokens": for token, score in logit_bias.items(): - token = token.encode('utf-8') + token = token.encode("utf-8") for input_id in llama.tokenize(token, add_bos=False): to_bias[input_id] = score @@ -352,7 +351,7 @@ async def create_completion( request: Request, body: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), -): +) -> llama_cpp.Completion: if isinstance(body.prompt, list): assert len(body.prompt) <= 1 body.prompt = body.prompt[0] if len(body.prompt) > 0 else "" @@ -364,7 +363,7 @@ async def create_completion( "logit_bias_type", "user", } - kwargs = body.dict(exclude=exclude) + kwargs = body.model_dump(exclude=exclude) if body.logit_bias is not None: kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ @@ -396,7 +395,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): return EventSourceResponse( recv_chan, data_sender_callable=partial(event_publisher, send_chan) - ) + ) # type: ignore else: completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs) # type: ignore return completion @@ -405,16 +404,17 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): class CreateEmbeddingRequest(BaseModel): model: Optional[str] = model_field input: Union[str, List[str]] = Field(description="The input to embed.") - user: Optional[str] - - class Config: - schema_extra = { - "example": { - "input": "The food was delicious and the waiter...", - } + user: Optional[str] = Field(default=None) + + model_config = { + "json_schema_extra": { + "examples": [ + { + "input": "The food was delicious and the waiter...", + } + ] } - - + } @router.post( @@ -424,7 +424,7 @@ async def create_embedding( request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama) ): return await run_in_threadpool( - llama.create_embedding, **request.dict(exclude={"user"}) + llama.create_embedding, **request.model_dump(exclude={"user"}) ) @@ -461,21 +461,22 @@ class CreateChatCompletionRequest(BaseModel): repeat_penalty: float = repeat_penalty_field logit_bias_type: Optional[Literal["input_ids", "tokens"]] = Field(None) - class Config: - schema_extra = { - "example": { - "messages": [ - ChatCompletionRequestMessage( - role="system", content="You are a helpful assistant." - ), - ChatCompletionRequestMessage( - role="user", content="What is the capital of France?" 
- ), - ] - } + model_config = { + "json_schema_extra": { + "examples": [ + { + "messages": [ + ChatCompletionRequestMessage( + role="system", content="You are a helpful assistant." + ).model_dump(), + ChatCompletionRequestMessage( + role="user", content="What is the capital of France?" + ).model_dump(), + ] + } + ] } - - + } @router.post( @@ -486,14 +487,14 @@ async def create_chat_completion( body: CreateChatCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama), settings: Settings = Depends(get_settings), -) -> Union[llama_cpp.ChatCompletion]: # type: ignore +) -> llama_cpp.ChatCompletion: exclude = { "n", "logit_bias", "logit_bias_type", "user", } - kwargs = body.dict(exclude=exclude) + kwargs = body.model_dump(exclude=exclude) if body.logit_bias is not None: kwargs['logits_processor'] = llama_cpp.LogitsProcessorList([ @@ -526,7 +527,7 @@ async def event_publisher(inner_send_chan: MemoryObjectSendStream): return EventSourceResponse( recv_chan, data_sender_callable=partial(event_publisher, send_chan), - ) + ) # type: ignore else: completion: llama_cpp.ChatCompletion = await run_in_threadpool( llama.create_chat_completion, **kwargs # type: ignore @@ -546,8 +547,6 @@ class ModelList(TypedDict): data: List[ModelData] - - @router.get("/v1/models") async def get_models( settings: Settings = Depends(get_settings), diff --git a/pyproject.toml b/pyproject.toml index a9e012e6b..1cff2318c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.70" +version = "0.1.71" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index b8acedb5a..71af72c44 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.70", + version="0.1.71", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", @@ -18,7 +18,7 @@ packages=["llama_cpp", "llama_cpp.server"], install_requires=["typing-extensions>=4.5.0", "numpy>=1.20.0", "diskcache>=5.6.1"], extras_require={ - "server": ["uvicorn>=0.22.1", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"], + "server": ["uvicorn>=0.22.0", "fastapi>=0.100.0", "pydantic-settings>=2.0.1", "sse-starlette>=1.6.1"], }, python_requires=">=3.7", classifiers=[ diff --git a/vendor/llama.cpp b/vendor/llama.cpp index a3b4d9328..6e7cca404 160000 --- a/vendor/llama.cpp +++ b/vendor/llama.cpp @@ -1 +1 @@ -Subproject commit a3b4d932859f4e51ed716bfa1f07e2d2eede2c23 +Subproject commit 6e7cca404748dd4b1a3affd0d1296e37f4ac0a6f From bdf32df255104df1b45453431e9fa19a03220a8a Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Jul 2023 15:34:32 -0400 Subject: [PATCH 44/48] Add additional direnv directory to gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 36ed7f7fd..3866fb251 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ _skbuild/ .envrc +.direnv models/ From e4f9db37db5dca97f22ec53e169ea047e23462c0 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Jul 2023 15:34:55 -0400 Subject: [PATCH 45/48] Fix context_params struct layout --- llama_cpp/llama_cpp.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/llama_cpp/llama_cpp.py b/llama_cpp/llama_cpp.py index 32f70f005..aef4f65c7 100644 --- a/llama_cpp/llama_cpp.py +++ b/llama_cpp/llama_cpp.py @@ -188,8 +188,6 @@ class 
llama_context_params(Structure): _fields_ = [ ("seed", c_uint32), ("n_ctx", c_int32), - ("rope_freq_base", c_float), - ("rope_freq_scale", c_float), ("n_batch", c_int32), ("n_gpu_layers", c_int32), ("main_gpu", c_int32), From 8ab098e49dfcc5a4afbcb2f11d54efa2cb606972 Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Jul 2023 15:35:08 -0400 Subject: [PATCH 46/48] Re-order Llama class params --- llama_cpp/llama.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 7bda0461b..92ca67d65 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -205,8 +205,6 @@ def __init__( model_path: str, # NOTE: These parameters are likely to change in the future. n_ctx: int = 512, - rope_freq_base: float = 10000.0, - rope_freq_scale: float = 1.0, n_parts: int = -1, n_gpu_layers: int = 0, seed: int = 1337, @@ -223,6 +221,8 @@ def __init__( lora_path: Optional[str] = None, low_vram: bool = False, tensor_split: Optional[List[float]] = None, + rope_freq_base: float = 10000.0, + rope_freq_scale: float = 1.0, verbose: bool = True, ): """Load a llama.cpp model from `model_path`. @@ -230,8 +230,6 @@ def __init__( Args: model_path: Path to the model. n_ctx: Maximum context size. - rope_freq_base: RoPE base frequency. - rope_freq_scale: RoPE frequency scale. n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined. seed: Random seed. -1 for random. f16_kv: Use half-precision for key/value cache. @@ -246,6 +244,8 @@ def __init__( lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model. lora_path: Path to a LoRA file to apply to the model. tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split. + rope_freq_base: Base frequency for rope sampling. + rope_freq_scale: Scale factor for rope sampling. verbose: Print verbose output to stderr. 
Raises: @@ -260,8 +260,6 @@ def __init__( self.params = llama_cpp.llama_context_default_params() self.params.n_ctx = n_ctx - self.params.rope_freq_base = rope_freq_base - self.params.rope_freq_scale = rope_freq_scale self.params.n_gpu_layers = n_gpu_layers self.params.seed = seed self.params.f16_kv = f16_kv @@ -281,6 +279,9 @@ def __init__( self._c_tensor_split = FloatArray(*tensor_split) # keep a reference to the array so it is not gc'd self.params.tensor_split = self._c_tensor_split + self.params.rope_freq_base = rope_freq_base + self.params.rope_freq_scale = rope_freq_scale + self.last_n_tokens_size = last_n_tokens_size self.n_batch = min(n_ctx, n_batch) From 6d8892fe64ca7eadd503ae01f93fbcd9ff3806dd Mon Sep 17 00:00:00 2001 From: Andrei Betlen Date: Sat, 15 Jul 2023 17:13:55 -0400 Subject: [PATCH 47/48] Bump version --- CHANGELOG.md | 6 ++++++ pyproject.toml | 2 +- setup.py | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47b55a73d..c7723c529 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.1.72] + +### Added + +- (llama.cpp) Update llama.cpp added custom_rope for extended context lengths + ## [0.1.71] ### Added diff --git a/pyproject.toml b/pyproject.toml index 1cff2318c..7839a869a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "llama_cpp_python" -version = "0.1.71" +version = "0.1.72" description = "Python bindings for the llama.cpp library" authors = ["Andrei Betlen "] license = "MIT" diff --git a/setup.py b/setup.py index 71af72c44..9b4de9785 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ description="A Python wrapper for llama.cpp", long_description=long_description, long_description_content_type="text/markdown", - version="0.1.71", + version="0.1.72", author="Andrei Betlen", author_email="abetlen@gmail.com", license="MIT", From c6fb8764e6daa42fee8659b6e169ec40b289efeb Mon Sep 17 00:00:00 2001 From: Mozer Date: Sun, 16 Jul 2023 13:41:56 +0300 Subject: [PATCH 48/48] Update llama.py --- llama_cpp/llama.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama_cpp/llama.py b/llama_cpp/llama.py index 92ca67d65..ed27476e7 100644 --- a/llama_cpp/llama.py +++ b/llama_cpp/llama.py @@ -221,8 +221,8 @@ def __init__( lora_path: Optional[str] = None, low_vram: bool = False, tensor_split: Optional[List[float]] = None, - rope_freq_base: float = 10000.0, - rope_freq_scale: float = 1.0, + rope_freq_base: float = 80000.0, + rope_freq_scale: float = 0.5, verbose: bool = True, ): """Load a llama.cpp model from `model_path`.
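
---

The net effect of this series on the high-level API is that `Llama(...)` now accepts `tensor_split`, `rope_freq_base`, and `rope_freq_scale`, and the final patch (48/48) changes the RoPE defaults from 10000.0 / 1.0 to 80000.0 / 0.5. Below is a minimal usage sketch, assuming llama-cpp-python around 0.1.72 and a local GGML model at an illustrative path; the path and numeric values are examples only, and the RoPE values shown explicitly restore the pre-patch-48 defaults.

```python
# Sketch of the constructor parameters introduced in this patch series.
# The model path and all values are illustrative assumptions, not recommendations.
from llama_cpp import Llama

llm = Llama(
    model_path="/models/ggml-model-name.bin",  # assumed local model path
    n_ctx=4096,              # larger context, made usable by custom RoPE scaling
    n_gpu_layers=32,         # number of layers to offload to the GPU
    tensor_split=None,       # e.g. [0.6, 0.4] to split layers across two GPUs on a multi-GPU build
    rope_freq_base=10000.0,  # RoPE base frequency (patch 48 changes the default to 80000.0)
    rope_freq_scale=1.0,     # RoPE frequency scale (patch 48 changes the default to 0.5)
)

output = llm(
    "Q: What is the capital of France? A:",
    max_tokens=32,
    stop=["\n"],
)
print(output["choices"][0]["text"])
```

Per the diff in patch 40, when `tensor_split` is `None` the model is not split; otherwise the list is expanded into a `c_float` array of length `LLAMA_MAX_DEVICES` and a reference is kept on the instance so the array is not garbage-collected before llama.cpp reads it.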