Skip to content

feat(cli): add --include-gitignored flag to exclude files listed in .gitignore #253

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jun 25, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ repos:
starlette>=0.40.0,
tiktoken,
tomli,
pathspec,
uvicorn>=0.11.7,
]
- id: pylint
Expand All @@ -124,6 +125,7 @@ repos:
starlette>=0.40.0,
tiktoken,
tomli,
pathspec,
uvicorn>=0.11.7,
]

Expand Down
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,9 @@ export GITHUB_TOKEN=github_pat_...
gitingest https://github.com/username/private-repo
```

By default, files listed in `.gitignore` are skipped. Use `--include-gitignored` if you
need those files in the digest.

By default, the digest is written to a text file (`digest.txt`) in your current working directory. You can customize the output in two ways:

- Use `--output/-o <filename>` to write to a specific file.
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ dependencies = [
"starlette>=0.40.0", # Vulnerable to https://osv.dev/vulnerability/GHSA-f96h-pmfr-66vw
"tiktoken>=0.7.0", # Support for o200k_base encoding
"tomli",
"pathspec>=0.12.1",
"typing_extensions; python_version < '3.10'",
"uvicorn>=0.11.7", # Vulnerable to https://osv.dev/vulnerability/PYSEC-2020-150
]
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
click>=8.0.0
fastapi[standard]>=0.109.1 # Vulnerable to https://osv.dev/vulnerability/PYSEC-2024-38
pathspec>=0.12.1
pydantic
python-dotenv
slowapi
Expand Down
15 changes: 14 additions & 1 deletion src/gitingest/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@
),
)
@click.option("--branch", "-b", default=None, help="Branch to clone and ingest")
@click.option(
"--include-gitignored",
is_flag=True,
default=False,
help="Include files matched by .gitignore",
)
@click.option(
"--token",
"-t",
Expand All @@ -61,6 +67,7 @@ def main(
exclude_pattern: Tuple[str, ...],
include_pattern: Tuple[str, ...],
branch: Optional[str],
include_gitignored: bool,
token: Optional[str],
):
"""
Expand All @@ -83,11 +90,12 @@ def main(
Glob patterns for including files in the output.
branch : str, optional
Specific branch to ingest (defaults to the repository's default).
include_gitignored : bool
If provided, include files normally ignored by .gitignore.
token: str, optional
GitHub personal-access token (PAT). Needed when *source* refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
"""

asyncio.run(
_async_main(
source=source,
Expand All @@ -96,6 +104,7 @@ def main(
exclude_pattern=exclude_pattern,
include_pattern=include_pattern,
branch=branch,
include_gitignored=include_gitignored,
token=token,
)
)
Expand All @@ -108,6 +117,7 @@ async def _async_main(
exclude_pattern: Tuple[str, ...],
include_pattern: Tuple[str, ...],
branch: Optional[str],
include_gitignored: bool,
token: Optional[str],
) -> None:
"""
Expand All @@ -132,6 +142,8 @@ async def _async_main(
Glob patterns for including files in the output.
branch : str, optional
Specific branch to ingest (defaults to the repository's default).
include_gitignored : bool
If provided, include files normally ignored by .gitignore.
token: str, optional
GitHub personal-access token (PAT). Needed when *source* refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
Expand Down Expand Up @@ -160,6 +172,7 @@ async def _async_main(
exclude_patterns=exclude_patterns,
branch=branch,
output=output_target,
include_gitignored=include_gitignored,
token=token,
)

Expand Down
12 changes: 12 additions & 0 deletions src/gitingest/entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from gitingest.config import TMP_BASE_PATH
from gitingest.ingestion import ingest_query
from gitingest.query_parsing import IngestionQuery, parse_query
from gitingest.utils.ignore_patterns import load_gitignore_patterns


async def ingest_async(
Expand All @@ -19,6 +20,7 @@ async def ingest_async(
include_patterns: Optional[Union[str, Set[str]]] = None,
exclude_patterns: Optional[Union[str, Set[str]]] = None,
branch: Optional[str] = None,
include_gitignored: bool = False,
token: Optional[str] = None,
output: Optional[str] = None,
) -> Tuple[str, str, str]:
Expand All @@ -42,6 +44,8 @@ async def ingest_async(
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
branch : str, optional
The branch to clone and ingest. If `None`, the default branch is used.
include_gitignored : bool
If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``.
token : str, optional
GitHub personal-access token (PAT). Needed when *source* refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
Expand Down Expand Up @@ -76,6 +80,10 @@ async def ingest_async(
token=token,
)

if not include_gitignored:
gitignore_patterns = load_gitignore_patterns(query.local_path)
query.ignore_patterns.update(gitignore_patterns)

if query.url:
selected_branch = branch if branch else query.branch # prioritize branch argument
query.branch = selected_branch
Expand Down Expand Up @@ -117,6 +125,7 @@ def ingest(
include_patterns: Optional[Union[str, Set[str]]] = None,
exclude_patterns: Optional[Union[str, Set[str]]] = None,
branch: Optional[str] = None,
include_gitignored: bool = False,
token: Optional[str] = None,
output: Optional[str] = None,
) -> Tuple[str, str, str]:
Expand All @@ -140,6 +149,8 @@ def ingest(
Pattern or set of patterns specifying which files to exclude. If `None`, no files are excluded.
branch : str, optional
The branch to clone and ingest. If `None`, the default branch is used.
include_gitignored : bool
If ``True``, include files ignored by ``.gitignore``. Defaults to ``False``.
token : str, optional
GitHub personal-access token (PAT). Needed when *source* refers to a
**private** repository. Can also be set via the ``GITHUB_TOKEN`` env var.
Expand All @@ -165,6 +176,7 @@ def ingest(
include_patterns=include_patterns,
exclude_patterns=exclude_patterns,
branch=branch,
include_gitignored=include_gitignored,
token=token,
output=output,
)
Expand Down
46 changes: 46 additions & 0 deletions src/gitingest/utils/ignore_patterns.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
"""Default ignore patterns for Gitingest."""

import os
from pathlib import Path
from typing import Set

DEFAULT_IGNORE_PATTERNS: Set[str] = {
Expand Down Expand Up @@ -160,3 +162,47 @@
# Gitingest
"digest.txt",
}


def load_gitignore_patterns(root: Path) -> Set[str]:
    """
    Recursively load ignore patterns from all .gitignore files under the given root directory.

    Every pattern is rewritten so that it is expressed relative to ``root`` (using ``/`` as
    the separator), which lets patterns from nested ``.gitignore`` files be merged into a
    single set:

    - Patterns from the root ``.gitignore`` are kept verbatim, including a leading ``/``
      anchor.
    - Patterns from a nested ``.gitignore`` that contain a ``/`` at the start or in the middle
      are anchored to that directory (``/dist`` in ``sub/`` becomes ``sub/dist``).
    - Patterns from a nested ``.gitignore`` without such a ``/`` may match at any depth below
      that directory (``cache`` in ``sub/`` becomes ``sub/**/cache``).
    - A leading ``!`` (negation) is preserved in front of the rewritten pattern.

    Blank lines, comment lines (starting with ``#``) and lines that contain nothing but
    ``!`` and/or ``/`` are skipped.

    Parameters
    ----------
    root : Path
        The root directory to search for .gitignore files.

    Returns
    -------
    Set[str]
        A set of ignore patterns extracted from all .gitignore files found under the root directory.
    """
    patterns: Set[str] = set()
    for dirpath, _, filenames in os.walk(root):
        if ".gitignore" not in filenames:
            continue

        # Location of this .gitignore relative to root, computed once per file.
        # Only the directory part is normalised to "/" so that backslash escapes inside
        # the patterns themselves (e.g. "\#literal") are left untouched.
        rel_dir = Path(os.path.relpath(dirpath, root)).as_posix()
        prefix = "" if rel_dir == "." else f"{rel_dir}/"

        gitignore_path = Path(dirpath) / ".gitignore"
        with gitignore_path.open("r", encoding="utf-8") as f:
            for line in f:
                stripped = line.strip()

                if not stripped or stripped.startswith("#"):
                    continue

                negated = stripped.startswith("!")
                if negated:
                    stripped = stripped[1:]

                # A bare "!" or a pattern made only of slashes cannot match anything.
                if not stripped.strip("/"):
                    continue

                if not prefix:
                    # Root-level patterns are already relative to root; keeping them
                    # verbatim preserves a leading "/" anchor.
                    pattern_body = stripped
                elif "/" in stripped.rstrip("/"):
                    # A separator at the start or in the middle anchors the pattern to
                    # the directory that holds this .gitignore.
                    pattern_body = prefix + stripped.lstrip("/")
                else:
                    # No separator (other than a trailing one): the pattern applies at
                    # any depth below the directory that holds this .gitignore.
                    pattern_body = f"{prefix}**/{stripped}"

                patterns.add(f"!{pattern_body}" if negated else pattern_body)

    return patterns
15 changes: 6 additions & 9 deletions src/gitingest/utils/ingestion_utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
"""Utility functions for the ingestion process."""

from fnmatch import fnmatch
from pathlib import Path
from typing import Set

from pathspec import PathSpec


def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) -> bool:
"""
Expand Down Expand Up @@ -38,10 +39,8 @@ def _should_include(path: Path, base_path: Path, include_patterns: Set[str]) ->
if path.is_dir():
return True

for pattern in include_patterns:
if fnmatch(rel_str, pattern):
return True
return False
spec = PathSpec.from_lines("gitwildmatch", include_patterns)
return spec.match_file(rel_str)


def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> bool:
Expand Down Expand Up @@ -73,7 +72,5 @@ def _should_exclude(path: Path, base_path: Path, ignore_patterns: Set[str]) -> b
return True

rel_str = str(rel_path)
for pattern in ignore_patterns:
if pattern and fnmatch(rel_str, pattern):
return True
return False
spec = PathSpec.from_lines("gitwildmatch", ignore_patterns)
return spec.match_file(rel_str)
73 changes: 73 additions & 0 deletions tests/test_gitignore_feature.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
"""
Tests for the gitignore functionality in Gitingest.
"""

from pathlib import Path

import pytest

from gitingest.entrypoint import ingest_async
from gitingest.utils.ignore_patterns import load_gitignore_patterns


@pytest.fixture(name="repo_path")
def repo_fixture(tmp_path: Path) -> Path:
    """
    Build a throwaway repository layout inside ``tmp_path`` and return its root.

    The layout contains:
    - '.gitignore' listing 'exclude.txt'
    - 'include.txt', which is expected to appear in the digest
    - 'exclude.txt', which is expected to be skipped while gitignore rules are respected
    """
    # File name -> file content, written in this order.
    layout = {
        ".gitignore": "exclude.txt\n",
        "include.txt": "This file should be included.",
        "exclude.txt": "This file should be excluded.",
    }
    for filename, text in layout.items():
        (tmp_path / filename).write_text(text)

    return tmp_path


def test_load_gitignore_patterns(tmp_path: Path):
    """
    Check that load_gitignore_patterns() returns the patterns of a .gitignore file and drops comments.
    """
    # Two real patterns followed by a comment line that must not be returned.
    (tmp_path / ".gitignore").write_text("exclude.txt\n*.log\n# a comment\n")

    loaded = load_gitignore_patterns(tmp_path)

    # Both real patterns are present.
    assert "exclude.txt" in loaded
    assert "*.log" in loaded
    # No comment line leaked into the result.
    assert not any(entry.startswith("#") for entry in loaded)


@pytest.mark.asyncio
async def test_ingest_with_gitignore(repo_path: Path):
    """
    Integration test: ingest_async() honours .gitignore unless told otherwise.

    With ``include_gitignored`` left at its default (``False``) the content of 'exclude.txt' must be absent.
    With ``include_gitignored=True`` the content of both files must be present.
    """
    source = str(repo_path)
    excluded_text = "This file should be excluded."
    included_text = "This file should be included."

    # Default behaviour: gitignore rules are applied, so only 'include.txt' is ingested.
    _, _, respecting_gitignore = await ingest_async(source=source)
    assert excluded_text not in respecting_gitignore
    assert included_text in respecting_gitignore

    # Opt-out: gitignore rules are bypassed, so both files are ingested.
    _, _, bypassing_gitignore = await ingest_async(source=source, include_gitignored=True)
    assert excluded_text in bypassing_gitignore
    assert included_text in bypassing_gitignore
14 changes: 7 additions & 7 deletions tests/test_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,10 @@ class PatternScenario(TypedDict):
"*/file_dir2.txt",
},
"ignore_patterns": {*()},
"expected_num_files": 3,
"expected_content": {"file1.txt", "file2.py", "dir2/file_dir2.txt"},
"expected_structure": {"test_repo/", "dir2/"},
"expected_not_structure": {"src/", "subdir/", "dir1/"},
"expected_num_files": 4,
"expected_content": {"file1.txt", "file2.py", "dir1/file_dir1.txt", "dir2/file_dir2.txt"},
"expected_structure": {"test_repo/", "dir1/", "dir2/"},
"expected_not_structure": {"src/", "subdir/"},
}
),
id="include-wildcard-directory",
Expand All @@ -114,9 +114,10 @@ class PatternScenario(TypedDict):
{
"include_patterns": {"**/file_dir2.txt", "src/**/*.py"},
"ignore_patterns": {*()},
"expected_num_files": 2,
"expected_num_files": 3,
"expected_content": {
"dir2/file_dir2.txt",
"src/subfile2.py",
"src/subdir/file_subdir.py",
},
"expected_structure": {"test_repo/", "dir2/", "src/", "subdir/"},
Expand Down Expand Up @@ -169,12 +170,11 @@ class PatternScenario(TypedDict):
{
"include_patterns": {*()},
"ignore_patterns": {"src/**/*.py"},
"expected_num_files": 7,
"expected_num_files": 6,
"expected_content": {
"file1.txt",
"file2.py",
"src/subfile1.txt",
"src/subfile2.py",
"src/subdir/file_subdir.txt",
"dir1/file_dir1.txt",
"dir2/file_dir2.txt",
Expand Down
Loading