Skip to content

fix: make cache aware of subpaths #481

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ x-prod-environment: &prod-environment

x-dev-environment: &dev-environment
DEBUG: "true"
- LOG_LEVEL: "debug"
+ LOG_LEVEL: "DEBUG"
RELOAD: "true"
GITINGEST_SENTRY_ENVIRONMENT: ${GITINGEST_SENTRY_ENVIRONMENT:-development}
# S3 Configuration for development
Expand Down
2 changes: 2 additions & 0 deletions src/server/query_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ async def _check_s3_cache(
user_name=cast("str", query.user_name),
repo_name=cast("str", query.repo_name),
commit=query.commit,
subpath=query.subpath,
include_patterns=query.include_patterns,
ignore_patterns=query.ignore_patterns,
)
Expand Down Expand Up @@ -168,6 +169,7 @@ def _store_digest_content(
user_name=cast("str", query.user_name),
repo_name=cast("str", query.repo_name),
commit=query.commit,
subpath=query.subpath,
include_patterns=query.include_patterns,
ignore_patterns=query.ignore_patterns,
)
Expand Down
10 changes: 7 additions & 3 deletions src/server/s3_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,14 +62,15 @@ def generate_s3_file_path(
user_name: str,
repo_name: str,
commit: str,
subpath: str,
include_patterns: set[str] | None,
ignore_patterns: set[str],
) -> str:
"""Generate S3 file path with proper naming convention.

The file path is formatted as:
[<S3_DIRECTORY_PREFIX>/]ingest/<provider>/<repo-owner>/<repo-name>/<branch>/<commit-ID>/
- <exclude&include hash>/<owner>-<repo-name>.txt
+ <exclude&include hash>/<owner>-<repo-name>-<subpath-hash>.txt

If S3_DIRECTORY_PREFIX environment variable is set, it will be prefixed to the path.
The commit-ID is always included in the URL.
Expand All @@ -85,6 +86,8 @@ def generate_s3_file_path(
Repository name.
commit : str
Commit hash.
subpath : str
Subpath of the repository.
include_patterns : set[str] | None
Set of patterns specifying which files to include.
ignore_patterns : set[str]
Expand All @@ -111,9 +114,10 @@ def generate_s3_file_path(
patterns_str = f"include:{sorted(include_patterns) if include_patterns else []}"
patterns_str += f"exclude:{sorted(ignore_patterns)}"
patterns_hash = hashlib.sha256(patterns_str.encode()).hexdigest()[:16]
subpath_hash = hashlib.sha256(subpath.encode()).hexdigest()[:16]

# Build the base path using hostname directly
- base_path = f"ingest/{hostname}/{user_name}/{repo_name}/{commit}/{patterns_hash}/{user_name}-{repo_name}.txt"
+ file_name = f"{user_name}-{repo_name}-{subpath_hash}.txt"
+ base_path = f"ingest/{hostname}/{user_name}/{repo_name}/{commit}/{patterns_hash}/{file_name}"

# Check for S3_DIRECTORY_PREFIX environment variable
s3_directory_prefix = os.getenv("S3_DIRECTORY_PREFIX")
Expand Down
Loading