From 0a1e6e9eb488be5e84afbb0743879ecc7e2e34d5 Mon Sep 17 00:00:00 2001 From: Amgad Hasan <109704569+AmgadHasan@users.noreply.github.com> Date: Fri, 11 Apr 2025 03:26:09 +0200 Subject: [PATCH 1/3] Update output_formatters.py to use gpt-4o's tokenizer --- src/gitingest/output_formatters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gitingest/output_formatters.py b/src/gitingest/output_formatters.py index 5bacba22..ff737b79 100644 --- a/src/gitingest/output_formatters.py +++ b/src/gitingest/output_formatters.py @@ -171,7 +171,7 @@ def _format_token_count(text: str) -> Optional[str]: The formatted number of tokens as a string (e.g., '1.2k', '1.2M'), or `None` if an error occurs. """ try: - encoding = tiktoken.get_encoding("cl100k_base") + encoding = tiktoken.get_encoding("o200k_base") # gpt-4o, gpt-4o-mini total_tokens = len(encoding.encode(text, disallowed_special=())) except (ValueError, UnicodeEncodeError) as exc: print(exc) From f318080362bf0b249b5bdca048dc459c6ed2bc86 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Wed, 18 Jun 2025 11:12:09 +0200 Subject: [PATCH 2/3] black fmt --- src/gitingest/output_formatters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gitingest/output_formatters.py b/src/gitingest/output_formatters.py index ff737b79..9ca3d474 100644 --- a/src/gitingest/output_formatters.py +++ b/src/gitingest/output_formatters.py @@ -171,7 +171,7 @@ def _format_token_count(text: str) -> Optional[str]: The formatted number of tokens as a string (e.g., '1.2k', '1.2M'), or `None` if an error occurs. """ try: - encoding = tiktoken.get_encoding("o200k_base") # gpt-4o, gpt-4o-mini + encoding = tiktoken.get_encoding("o200k_base") # gpt-4o, gpt-4o-mini total_tokens = len(encoding.encode(text, disallowed_special=())) except (ValueError, UnicodeEncodeError) as exc: print(exc) From 1130b55b81eb850017214820966f416d7d4d4f03 Mon Sep 17 00:00:00 2001 From: Filip Christiansen <22807962+filipchristiansen@users.noreply.github.com> Date: Wed, 18 Jun 2025 11:48:01 +0200 Subject: [PATCH 3/3] =?UTF-8?q?feat:=20bump=20tiktoken=20to=20=E2=89=A50.7?= =?UTF-8?q?.0=20and=20drop=20Python=203.7=20support?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Context ------- The previous commit switched token counting from **cl100k_base** to **o200k_base**. That encoding is available only in **tiktoken ≥ 0.7.0**, whose wheels require Python 3.8+. We already exclude 3.7 from the CI matrix, so this raises the documented minimum Python version without affecting tested platforms. Changes ------- * **README.md** – “Python 3.7+” → **“Python 3.8+”** * **pyproject.toml** * `tiktoken` → `tiktoken>=0.7.0` # o200k_base support * Remove classifier *Programming Language :: Python :: 3.7* * **requirements.txt** – `tiktoken` → `tiktoken>=0.7.0` * No code changes beyond the earlier one-liner already in this PR. Impact ------ * **Breaking** for users pinned to Python 3.7; they must upgrade to ≥ 3.8. * No new runtime dependencies—just a higher floor for *tiktoken* and Python. --- README.md | 2 +- pyproject.toml | 3 +-- requirements.txt | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ba69b0a9..9ed8318b 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ You can also replace `hub` with `ingest` in any GitHub URL to access the corresp ## 📚 Requirements -- Python 3.7+ +- Python 3.8+ - For private repositories: A GitHub Personal Access Token (PAT). You can generate one at [https://github.com/settings/personal-access-tokens](https://github.com/settings/personal-access-tokens) (Profile → Settings → Developer Settings → Personal Access Tokens → Fine-grained Tokens) ### 📦 Installation diff --git a/pyproject.toml b/pyproject.toml index f280d4a4..f6d39290 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ dependencies = [ "python-dotenv", "slowapi", "starlette>=0.40.0", # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw - "tiktoken", + "tiktoken>=0.7.0", # Support for o200k_base encoding "tomli", "typing_extensions; python_version < '3.10'", "uvicorn>=0.11.7", # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150 @@ -23,7 +23,6 @@ classifiers=[ "Development Status :: 3 - Alpha", "Intended Audience :: Developers", "License :: OSI Approved :: MIT License", - "Programming Language :: Python :: 3.7", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", diff --git a/requirements.txt b/requirements.txt index 5f8657ed..aa8ff03b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,6 @@ pydantic python-dotenv slowapi starlette>=0.40.0 # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw -tiktoken +tiktoken>=0.7.0 # Support for o200k_base encoding tomli uvicorn>=0.11.7 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150