diff --git a/.github/workflows/deploy-demo.yml b/.github/workflows/deploy-demo.yml index d38d8a1..c9440a6 100644 --- a/.github/workflows/deploy-demo.yml +++ b/.github/workflows/deploy-demo.yml @@ -2,29 +2,21 @@ name: Build and deploy demo on: workflow_dispatch: - push: - branches: - - main - schedule: - - cron: '0 0 * * *' +# schedule: +# - cron: '0 0 * * *' jobs: scheduled: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 name: Check out repo - name: Set up Python - uses: actions/setup-python@v1 + uses: actions/setup-python@v4 with: - python-version: 3.8 - - uses: actions/cache@v1 - name: Configure pip caching - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }} - restore-keys: | - ${{ runner.os }}-pip- + python-version: "3.12" + cache: pip + cache-dependency-path: setup.py - name: Install Python dependencies run: | python -m pip install --upgrade pip @@ -35,7 +27,7 @@ jobs: pip install bs4 - name: Create auth.json env: - GITHUB_ACCESS_TOKEN: ${{ secrets.GITHUB_ACCESS_TOKEN }} + GITHUB_ACCESS_TOKEN: ${{ secrets.GH_TOKEN }} run: | echo "{\"github_personal_token\": \"$GITHUB_ACCESS_TOKEN\"}" > auth.json - name: Fetch previous copy of database @@ -91,13 +83,13 @@ jobs: sqlite-utils rebuild-fts github.db # Populate _analyze_tables_ table sqlite-utils analyze-tables github.db --save - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 with: path: github.db - name: Set up Cloud Run - uses: GoogleCloudPlatform/github-actions/setup-gcloud@master + uses: google-github-actions/setup-gcloud@v0 with: - version: '275.0.0' + version: '318.0.0' service_account_email: ${{ secrets.GCP_SA_EMAIL }} service_account_key: ${{ secrets.GCP_SA_KEY }} - name: Deploy to Cloud Run @@ -107,12 +99,11 @@ jobs: datasette publish cloudrun github.db \ -m demo-metadata.json \ --service github-to-sqlite \ - --branch=main \ - --install=py-gfm \ --install=datasette-search-all>=0.3 \ 
--install=datasette-render-markdown>=1.1.2 \ - --install=datasette-pretty-json \ + --install='datasette-pretty-json>=0.2.2' \ --install=datasette-json-html \ --install=datasette-vega \ --install=datasette-render-images \ - --install=datasette-graphql + --install=datasette-graphql \ + --install=datasette-atom diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 3755c3a..c28f0fe 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -9,20 +9,15 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8, 3.9] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - uses: actions/cache@v2 - name: Configure pip caching - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }} - restore-keys: | - ${{ runner.os }}-pip- + cache: pip + cache-dependency-path: setup.py - name: Install dependencies run: | pip install -e '.[test]' @@ -33,18 +28,13 @@ jobs: runs-on: ubuntu-latest needs: [test] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 - with: - python-version: '3.9' - - uses: actions/cache@v2 - name: Configure pip caching + uses: actions/setup-python@v4 with: - path: ~/.cache/pip - key: ${{ runner.os }}-publish-pip-${{ hashFiles('**/setup.py') }} - restore-keys: | - ${{ runner.os }}-publish-pip- + python-version: "3.12" + cache: pip + cache-dependency-path: setup.py - name: Install dependencies run: | pip install setuptools wheel twine diff --git a/.github/workflows/readme-toc.yaml b/.github/workflows/readme-toc.yaml index 39c9028..3e81dd8 100644 --- a/.github/workflows/readme-toc.yaml +++ b/.github/workflows/readme-toc.yaml @@ -13,7 +13,7 @@ jobs: runs-on: 
ubuntu-latest steps: - name: Check out repo - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Update TOC run: npx markdown-toc README.md -i - name: Commit and push if README changed diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a177421..c49fa4c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,20 +7,15 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8, 3.9] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - - uses: actions/cache@v2 - name: Configure pip caching - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }} - restore-keys: | - ${{ runner.os }}-pip- + cache: pip + cache-dependency-path: setup.py - name: Install dependencies run: | pip install -e '.[test]' diff --git a/.gitignore b/.gitignore index 27b93de..d9e1f4d 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ venv .eggs .pytest_cache *.egg-info - +.coverage +build/ diff --git a/README.md b/README.md index d4c1f5b..a45bfc0 100644 --- a/README.md +++ b/README.md @@ -82,13 +82,25 @@ You can use the `--pull-request` option one or more times to load specific pull Note that the `merged_by` column on the `pull_requests` table will only be populated for pull requests that are loaded using the `--pull-request` option - the GitHub API does not return this field for pull requests that are loaded in bulk. 
+You can load only pull requests in a certain state with the `--state` option: + + $ github-to-sqlite pull-requests --state=open github.db simonw/datasette + +Pull requests across an entire organization (or more than one) can be loaded with `--org`: + + $ github-to-sqlite pull-requests --state=open --org=psf --org=python github.db + +You can use a search query to find pull requests. Note that no more than 1000 will be loaded (this is a GitHub API limitation), and some data will be missing (base and head SHAs). When using searches, other filters are ignored; put all criteria into the search itself: + + $ github-to-sqlite pull-requests --search='org:python defaultdict state:closed created:<2023-09-01' github.db + Example: [pull_requests table](https://github-to-sqlite.dogsheep.net/github/pull_requests) ## Fetching issue comments for a repository The `issue-comments` command retrieves all of the comments on all of the issues in a repository. -It is recommended you run `issues` first, so that each imported comment can have a foreign key poining to its issue. +It is recommended you run `issues` first, so that each imported comment can have a foreign key pointing to its issue. $ github-to-sqlite issues github.db simonw/datasette $ github-to-sqlite issue-comments github.db simonw/datasette @@ -101,7 +113,7 @@ Example: [issue_comments table](https://github-to-sqlite.dogsheep.net/github/iss ## Fetching commits for a repository -The `commits` command retrieves details of all of the commits for one or more repositories. It currently fetches the sha, commit message and author and committer details - it does no retrieve the full commit body. +The `commits` command retrieves details of all of the commits for one or more repositories. It currently fetches the SHA, commit message and author and committer details; it does not retrieve the full commit body. 
$ github-to-sqlite commits github.db simonw/datasette simonw/sqlite-utils @@ -156,7 +168,7 @@ You can pass more than one username to fetch for multiple users or organizations $ github-to-sqlite repos github.db simonw dogsheep -Add the `--readme` option to save the README for the repo in a column called `readme`. Add `--readme-html` to save the HTML rendered version of the README into a collumn called `readme_html`. +Add the `--readme` option to save the README for the repo in a column called `readme`. Add `--readme-html` to save the HTML rendered version of the README into a column called `readme_html`. Example: [repos table](https://github-to-sqlite.dogsheep.net/github/repos) @@ -208,7 +220,7 @@ The command accepts one or more repositories. Add `-v` for verbose output. -Example: [dependents table](https://github-to-sqlite.dogsheep.net/github/dependents) +Example: [dependents table](https://github-to-sqlite.dogsheep.net/github/dependents?_sort_desc=first_seen_utc) ## Fetching emojis @@ -216,7 +228,7 @@ You can fetch a list of every emoji supported by GitHub using the `emojis` comma $ github-to-sqlite emojis github.db -This will create a table callad `emojis` with a primary key `name` and a `url` column. +This will create a table called `emojis` with a primary key `name` and a `url` column. If you add the `--fetch` option the command will also fetch the binary content of the images and place them in an `image` column: @@ -235,7 +247,7 @@ The `github-to-sqlite get` command provides a convenient shortcut for making aut This will make an authenticated call to the URL you provide and pretty-print the resulting JSON to the console. 
-You can ommit the `https://api.github.com/` prefix, for example: +You can omit the `https://api.github.com/` prefix, for example: $ github-to-sqlite get /gists diff --git a/demo-metadata.json b/demo-metadata.json index 293c947..c04aa5a 100644 --- a/demo-metadata.json +++ b/demo-metadata.json @@ -45,10 +45,7 @@ "span": [ "class" ] - }, - "extensions": [ - "mdx_gfm:GithubFlavoredMarkdownExtension" - ] + } } } }, @@ -89,10 +86,7 @@ "span": [ "class" ] - }, - "extensions": [ - "mdx_gfm:GithubFlavoredMarkdownExtension" - ] + } } } }, @@ -138,10 +132,7 @@ "span": [ "class" ] - }, - "extensions": [ - "mdx_gfm:GithubFlavoredMarkdownExtension" - ] + } } } }, @@ -180,10 +171,7 @@ "span": [ "class" ] - }, - "extensions": [ - "mdx_gfm:GithubFlavoredMarkdownExtension" - ] + } } } } diff --git a/github_to_sqlite/cli.py b/github_to_sqlite/cli.py index 8609db3..e6a2d88 100644 --- a/github_to_sqlite/cli.py +++ b/github_to_sqlite/cli.py @@ -1,5 +1,6 @@ import click import datetime +import itertools import pathlib import textwrap import os @@ -104,19 +105,53 @@ def issues(db_path, repo, issue_ids, auth, load): type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True), help="Load pull-requests JSON from this file instead of the API", ) -def pull_requests(db_path, repo, pull_request_ids, auth, load): +@click.option( + "--org", + "orgs", + help="Fetch all pull requests from this GitHub organization", + multiple=True, +) +@click.option( + "--state", + help="Only fetch pull requests in this state", +) +@click.option( + "--search", + help="Find pull requests with a search query", +) +def pull_requests(db_path, repo, pull_request_ids, auth, load, orgs, state, search): "Save pull_requests for a specified repository, e.g. 
simonw/datasette" db = sqlite_utils.Database(db_path) token = load_token(auth) - repo_full = utils.fetch_repo(repo, token) - utils.save_repo(db, repo_full) if load: + repo_full = utils.fetch_repo(repo, token) + utils.save_repo(db, repo_full) pull_requests = json.load(open(load)) + utils.save_pull_requests(db, pull_requests, repo_full) + elif search: + repos_by_url = {} # repository_url -> repo dict, so each repo is fetched once + search += " is:pr" + pull_requests = utils.fetch_searched_pulls_or_issues(search, token) + for pull_request in pull_requests: + pr_repo_url = pull_request["repository_url"] + if pr_repo_url not in repos_by_url: + pr_repo = utils.fetch_repo(url=pr_repo_url, token=token) + utils.save_repo(db, pr_repo) + repos_by_url[pr_repo_url] = pr_repo + utils.save_pull_requests(db, [pull_request], repos_by_url[pr_repo_url]) else: - pull_requests = utils.fetch_pull_requests(repo, token, pull_request_ids) - - pull_requests = list(pull_requests) - utils.save_pull_requests(db, pull_requests, repo_full) + if orgs: + repos = itertools.chain.from_iterable( + utils.fetch_all_repos(token=token, org=org) + for org in orgs + ) + else: + repos = [utils.fetch_repo(repo, token)] + for repo_full in repos: + utils.save_repo(db, repo_full) + repo = repo_full["full_name"] + pull_requests = utils.fetch_pull_requests(repo, state, token, pull_request_ids) + utils.save_pull_requests(db, pull_requests, repo_full) utils.ensure_db_shape(db) @@ -192,7 +227,7 @@ def starred(db_path, username, auth, load): @click.option( "-a", "--auth", - type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True), + type=click.Path(file_okay=True, dir_okay=False, allow_dash=True), default="auth.json", help="Path to auth.json token file", ) @@ -244,7 +279,7 @@ def stargazers(db_path, repos, auth): help="Fetch HTML rendered README into 'readme_html' column", ) def repos(db_path, usernames, auth, repo, load, readme, readme_html): - "Save repos owened by the specified (or authenticated) username or organization" + "Save repos owned by the specified (or authenticated) username or 
organization" db = sqlite_utils.Database(db_path) token = load_token(auth) if load: @@ -539,7 +574,7 @@ def emojis(db_path, auth, fetch): help="Accept header to send, e.g. application/vnd.github.VERSION.html", ) def get(url, auth, paginate, nl, accept): - "Save repos owened by the specified (or authenticated) username or organization" + "Make an authenticated HTTP GET against the specified URL" token = load_token(auth) first = True should_output_closing_brace = not nl @@ -585,7 +620,7 @@ def get(url, auth, paginate, nl, accept): @click.option( "-a", "--auth", - type=click.Path(file_okay=True, dir_okay=False, allow_dash=True, exists=True), + type=click.Path(file_okay=True, dir_okay=False, allow_dash=True), default="auth.json", help="Path to auth.json token file", ) diff --git a/github_to_sqlite/utils.py b/github_to_sqlite/utils.py index bae4ac6..c837690 100644 --- a/github_to_sqlite/utils.py +++ b/github_to_sqlite/utils.py @@ -2,6 +2,7 @@ import requests import re import time +import urllib.parse import yaml FTS_CONFIG = { @@ -74,16 +75,17 @@ class GitHubError(Exception): - def __init__(self, message, status_code): + def __init__(self, message, status_code, headers=None): self.message = message self.status_code = status_code + self.headers = headers @classmethod def from_response(cls, response): message = response.json()["message"] if "git repository is empty" in message.lower(): cls = GitHubRepositoryEmpty - return cls(message, response.status_code) + return cls(message, response.status_code, response.headers) class GitHubRepositoryEmpty(GitHubError): @@ -169,8 +171,11 @@ def save_pull_requests(db, pull_requests, repo): # Add repo key pull_request["repo"] = repo["id"] # Pull request _links can be flattened to just their URL - pull_request["url"] = pull_request["_links"]["html"]["href"] - pull_request.pop("_links") + if "_links" in pull_request: + pull_request["url"] = pull_request["_links"]["html"]["href"] + pull_request.pop("_links") + else: + pull_request["url"] 
= pull_request["pull_request"]["html_url"] # Extract user pull_request["user"] = save_user(db, pull_request["user"]) labels = pull_request.pop("labels") @@ -178,8 +183,9 @@ def save_pull_requests(db, pull_requests, repo): if pull_request.get("merged_by"): pull_request["merged_by"] = save_user(db, pull_request["merged_by"]) # Head sha - pull_request["head"] = pull_request["head"]["sha"] - pull_request["base"] = pull_request["base"]["sha"] + if "head" in pull_request: + pull_request["head"] = pull_request["head"]["sha"] + pull_request["base"] = pull_request["base"]["sha"] # Extract milestone if pull_request["milestone"]: pull_request["milestone"] = save_milestone( @@ -223,6 +229,11 @@ def save_pull_requests(db, pull_requests, repo): def save_user(db, user): + # Under some conditions, GitHub caches removed repositories with + # stars and ends up leaving dangling `None` user references. + if user is None: + return None + # Remove all url fields except avatar_url and html_url to_save = { key: value @@ -286,12 +297,13 @@ def save_issue_comment(db, comment): return last_pk -def fetch_repo(full_name, token=None): +def fetch_repo(full_name=None, token=None, url=None): headers = make_headers(token) # Get topics: headers["Accept"] = "application/vnd.github.mercy-preview+json" - owner, slug = full_name.split("/") - url = "https://api.github.com/repos/{}/{}".format(owner, slug) + if url is None: + owner, slug = full_name.split("/") + url = "https://api.github.com/repos/{}/{}".format(owner, slug) response = requests.get(url, headers=headers) response.raise_for_status() return response.json() @@ -352,7 +364,7 @@ def fetch_issues(repo, token=None, issue_ids=None): yield from issues -def fetch_pull_requests(repo, token=None, pull_request_ids=None): +def fetch_pull_requests(repo, state=None, token=None, pull_request_ids=None): headers = make_headers(token) headers["accept"] = "application/vnd.github.v3+json" if pull_request_ids: @@ -364,11 +376,20 @@ def fetch_pull_requests(repo, 
token=None, pull_request_ids=None): response.raise_for_status() yield response.json() else: - url = "https://api.github.com/repos/{}/pulls?state=all&filter=all".format(repo) + state = state or "all" + url = f"https://api.github.com/repos/{repo}/pulls?state={state}" for pull_requests in paginate(url, headers): yield from pull_requests +def fetch_searched_pulls_or_issues(query, token=None): + headers = make_headers(token) + url = "https://api.github.com/search/issues?" + url += urllib.parse.urlencode({"q": query}) + for pulls_or_issues in paginate(url, headers): + yield from pulls_or_issues["items"] + + def fetch_issue_comments(repo, token=None, issue=None): assert "/" in repo headers = make_headers(token) @@ -439,13 +460,15 @@ def fetch_stargazers(repo, token=None): yield from stargazers -def fetch_all_repos(username=None, token=None): - assert username or token, "Must provide username= or token= or both" +def fetch_all_repos(username=None, token=None, org=None): + assert username or token or org, "Must provide username= or token= or org= or a combination" headers = make_headers(token) # Get topics for each repo: headers["Accept"] = "application/vnd.github.mercy-preview+json" if username: url = "https://api.github.com/users/{}/repos".format(username) + elif org: + url = "https://api.github.com/orgs/{}/repos".format(org) else: url = "https://api.github.com/user/repos" for repos in paginate(url, headers): @@ -463,6 +486,7 @@ def fetch_user(username=None, token=None): def paginate(url, headers=None): + url += ("&" if "?" in url else "?") + "per_page=100" while url: response = requests.get(url, headers=headers) # For HTTP 204 no-content this yields an empty list @@ -726,7 +750,7 @@ def scrape_dependents(repo, verbose=False): yield from repos # next page? 
try: - next_link = soup.select(".paginate-container")[0].find("a", text="Next") + next_link = soup.select(".paginate-container")[0].find("a", string="Next") except IndexError: break if next_link is not None: diff --git a/setup.py b/setup.py index d33ead4..de72b51 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from setuptools import setup import os -VERSION = "2.8.2" +VERSION = "2.9" def get_long_description():