diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 16d03db..8036bea 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -7,7 +7,7 @@ "features": { "ghcr.io/devcontainers/features/python:1": { "installTools": true, - "version": "3.12.1" + "version": "3.13.2" } }, "customizations": { @@ -15,7 +15,8 @@ "extensions": [ "streetsidesoftware.code-spell-checker", "ms-python.python", - "redhat.vscode-yaml" + "redhat.vscode-yaml", + "GitHub.copilot" ] } } diff --git a/.dockerignore b/.dockerignore index 21d444f..7c04879 100644 --- a/.dockerignore +++ b/.dockerignore @@ -162,6 +162,8 @@ cython_debug/ ## Local local/ +.vscode/ +.github/ ## test outputs bkps/ \ No newline at end of file diff --git a/.github/actions/docker/action.yml b/.github/actions/docker/action.yml new file mode 100644 index 0000000..3fa3194 --- /dev/null +++ b/.github/actions/docker/action.yml @@ -0,0 +1,84 @@ +--- +name: Docker image +description: Creates a Docker image + +# note inputs have some weird behavior: https://github.com/actions/runner/issues/1483 +# for string type, don't wrap in JSON +# for boolean type, wrap in JSON + +inputs: + dockerhub_username: + description: Dockerhub username + type: string + required: false + default: none + dockerhub_token: + description: Dockerhub token + type: string + required: false + default: none + push: + description: Push Images to docker hub + type: boolean + required: false + default: true + latest: + description: Update latest tag + type: boolean + required: false + default: true + +runs: + using: composite + steps: + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: docker.io/${{ github.repository }} + flavor: | + latest=${{ fromJSON(inputs.latest) }} + # for some reason can't get this to show up from docker image labels + # placing here for now + labels: | + org.opencontainers.image.description=Page asset and content exporter for Bookstack + tags: | + ## add the event types that should be added as tags + ## on merge to master - update `main` tag for testing before release + type=ref,event=branch + ## on release - for use by users + ## version ; shorthand for {{major}}.{{minor}}.{{patch}} (can include pre-release) + type=semver,pattern={{ version }} + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Dockerhub + if: github.event_name != 'pull_request' + uses: docker/login-action@v3 + with: + username: ${{ inputs.dockerhub_username }} + password: ${{ inputs.dockerhub_token }} + + - name: Build Docker Image + if: github.event_name != 'pull_request' + uses: docker/build-push-action@v5 + with: + context: . 
+ file: ./Dockerfile + platforms: linux/amd64,linux/arm64 + push: ${{ fromJSON(inputs.push) }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + - name: Update Dockerhub Documentation + uses: peter-evans/dockerhub-description@v3 + if: ${{ (fromJSON(inputs.latest) == true) && (github.event_name != 'pull_request') }} + with: + username: ${{ inputs.dockerhub_username }} + password: ${{ inputs.dockerhub_token }} + repository: ${{ github.repository }} \ No newline at end of file diff --git a/.github/actions/python/action.yml b/.github/actions/python/action.yml new file mode 100644 index 0000000..774aa44 --- /dev/null +++ b/.github/actions/python/action.yml @@ -0,0 +1,41 @@ +--- +name: Deploy to PyPi +description: Deploys the python package to PyPi + +inputs: + pypi_api_token: + description: PyPi api token + type: string + required: true + +runs: + using: composite + steps: + - name: Get tag release without v + shell: bash + run: | + TAG=${{ github.ref_name }} + echo "VERSION=${TAG#v}" >> "$GITHUB_ENV" + echo "Tag without v is: ${VERSION}" + - name: Update Release Tag + shell: bash + run: sed -i "s/^version = [^ ]*/version = ${{ env.VERSION }}/" setup.cfg + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.13.2' + - name: Install Dependencies + shell: bash + run: | + python -m pip install --upgrade pip + pip install build + - name: Build Python Package + shell: bash + run: | + python -m pip install --upgrade build + python -m build + - name: Publish to PyPi + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ inputs.pypi_api_token }} + skip-existing: true \ No newline at end of file diff --git a/.github/actions/tests/action.yml b/.github/actions/tests/action.yml new file mode 100644 index 0000000..d267fca --- /dev/null +++ b/.github/actions/tests/action.yml @@ -0,0 +1,20 @@ +--- +name: Test Python Package +description: Test and lint code + +runs: + using: composite + steps: + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.13.2' + - name: Install dependencies + shell: bash + run: | + python -m pip install --upgrade pip + pip install pylint + - name: Analysing the code with pylint + shell: bash + run: | + pylint $(git ls-files '*.py') \ No newline at end of file diff --git a/.github/workflows/always.pylint.yml b/.github/bkp/always.pylint.yml similarity index 100% rename from .github/workflows/always.pylint.yml rename to .github/bkp/always.pylint.yml diff --git a/.github/bkp/on_pr_merged.yml b/.github/bkp/on_pr_merged.yml new file mode 100644 index 0000000..04552fb --- /dev/null +++ b/.github/bkp/on_pr_merged.yml @@ -0,0 +1,121 @@ +# needs: [tests] # require tests to pass before deploy runs + +name: Build and Push + +# on: +# push: +# # Pattern matched against refs/tags +# tags: +# - '**' # Push events to every tag including hierarchical tags like v1.0/beta + +on: + pull_request: + types: + - closed + branches: + - main + +# maybe trigger build/push on release tags? 
+# but this also works for my use case +jobs: + docker_deploy: + if: github.event.pull_request.merged + runs-on: ubuntu-latest + # specify this to target the correct env/secrets to use + environment: 'Dockerhub' + steps: + - uses: actions/checkout@v3 + - name: Login to Dockerhub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + - name: Build and push the Docker image + run: make docker_build_latest + # - name: Push Docker image + # run: make docker_push_latest + - name: Update Dockerhub Documentation + uses: peter-evans/dockerhub-description@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + repository: ${{ env.DOCKERHUB_REPO }} + pypi_deploy: + if: github.event.pull_request.merged + runs-on: ubuntu-latest + environment: 'PyPi' + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + - name: Set tag version + run: | + TAG=$(cat Makefile | grep -E ^IMAGE_TAG=[0-9].[0-9].[0-9] | cut -d "=" -f2) + echo "VERSION=${TAG}" >> "$GITHUB_ENV" + echo "version from Makefile is: ${VERSION}" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Update Release tag + run: sed -i "s/^version = [^ ]*/version = ${{ env.VERSION }}/" setup.cfg + - name: Build package + run: make build + - name: Publish package + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + skip-existing: true + create_tag: + if: github.event.pull_request.merged + runs-on: ubuntu-latest + needs: + - docker_deploy + - pypi_deploy + permissions: + contents: write + steps: + - uses: actions/checkout@v3 + with: + ref: ${{ github.event.pull_request.merge_commit_sha }} + fetch-depth: '0' + - name: Set tag version + run: | + TAG=$(cat Makefile | grep -E ^IMAGE_TAG=[0-9].[0-9].[0-9] | cut -d "=" -f2) + echo "VERSION=v${TAG}" >> "$GITHUB_ENV" + echo "version from Makefile is: ${VERSION}" + - name: Create tag + uses: anothrNick/github-tag-action@1.64.0 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # WITH_V: true + # PRERELEASE: true + CUSTOM_TAG: ${{ env.VERSION }} + create_release: + if: github.event.pull_request.merged + runs-on: ubuntu-latest + needs: + - create_tag + permissions: + contents: write + steps: + - uses: actions/checkout@v3 + - name: Set tag version + run: | + TAG=$(cat Makefile | grep -E ^IMAGE_TAG=[0-9].[0-9].[0-9] | cut -d "=" -f2) + echo "VERSION=v${TAG}" >> "$GITHUB_ENV" + echo "version from Makefile is: ${VERSION}" + - name: Generate release + uses: ncipollo/release-action@v1 + with: + tag: ${{ env.VERSION }} + generateReleaseNotes: true + skipIfReleaseExists: true + # docker image tag latest diff --git a/.github/workflows/on_pr_open.docker-build.yml b/.github/bkp/on_pr_open.docker-build.yml similarity index 84% rename from .github/workflows/on_pr_open.docker-build.yml rename to .github/bkp/on_pr_open.docker-build.yml index 0f91b2e..73a9c1b 100644 --- a/.github/workflows/on_pr_open.docker-build.yml +++ b/.github/bkp/on_pr_open.docker-build.yml @@ -11,6 +11,11 @@ jobs: - uses: actions/checkout@v3 - name: Prepare Makefile run: sed -i 's/^IMAGE_TAG=[^ ]*/IMAGE_TAG=${{github.run_id}}/' Makefile + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + id: 
buildx + uses: docker/setup-buildx-action@v3 - name: Build the Docker image run: make docker_build pip_build: diff --git a/.github/workflows/on_pr_merged.yml b/.github/workflows/on_pr_merged.yml index 255b7ed..d270180 100644 --- a/.github/workflows/on_pr_merged.yml +++ b/.github/workflows/on_pr_merged.yml @@ -1,116 +1,32 @@ # needs: [tests] # require tests to pass before deploy runs -name: Build and Push - -# on: -# push: -# # Pattern matched against refs/tags -# tags: -# - '**' # Push events to every tag including hierarchical tags like v1.0/beta +name: Create Current Main Image on: - pull_request: - types: - - closed + push: branches: - main -# maybe trigger build/push on release tags? -# but this also works for my use case jobs: - docker_deploy: - if: github.event.pull_request.merged - runs-on: ubuntu-latest - # specify this to target the correct env/secrets to use - environment: 'Dockerhub' - steps: - - uses: actions/checkout@v3 - - name: Login to Dockerhub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Build the Docker image - run: make docker_build_latest - - name: Push Docker image - run: make docker_push_latest - - name: Update Dockerhub Documentation - uses: peter-evans/dockerhub-description@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - repository: ${{ env.DOCKERHUB_REPO }} - pypi_deploy: - if: github.event.pull_request.merged + test: runs-on: ubuntu-latest - environment: 'PyPi' steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: '3.x' - - name: Set tag version - run: | - TAG=$(cat Makefile | grep -E ^IMAGE_TAG=[0-9].[0-9].[0-9] | cut -d "=" -f2) - echo "VERSION=${TAG}" >> "$GITHUB_ENV" - echo "version from Makefile is: ${VERSION}" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - - name: Update Release tag - run: sed -i "s/^version = [^ ]*/version = ${{ env.VERSION }}/" setup.cfg - - name: Build package - run: make build - - name: Publish package - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.PYPI_API_TOKEN }} - skip-existing: true - create_tag: - if: github.event.pull_request.merged - runs-on: ubuntu-latest - needs: - - docker_deploy - - pypi_deploy - permissions: - contents: write - steps: - - uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.merge_commit_sha }} - fetch-depth: '0' - - name: Set tag version - run: | - TAG=$(cat Makefile | grep -E ^IMAGE_TAG=[0-9].[0-9].[0-9] | cut -d "=" -f2) - echo "VERSION=v${TAG}" >> "$GITHUB_ENV" - echo "version from Makefile is: ${VERSION}" - - name: Create tag - uses: anothrNick/github-tag-action@1.64.0 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # WITH_V: true - # PRERELEASE: true - CUSTOM_TAG: ${{ env.VERSION }} - create_release: - if: github.event.pull_request.merged + - name: Checkout + uses: actions/checkout@v4 + - name: Python Tests + uses: ./.github/actions/tests + # push to `main` image for testing/most up to date + docker-build: runs-on: ubuntu-latest - needs: - - create_tag - permissions: - contents: write + needs: test + environment: 'Dockerhub' + timeout-minutes: 10 steps: - - uses: actions/checkout@v3 - - name: Set tag version - run: | - TAG=$(cat Makefile | grep -E ^IMAGE_TAG=[0-9].[0-9].[0-9] | cut -d "=" -f2) - echo "VERSION=v${TAG}" >> "$GITHUB_ENV" - echo "version from Makefile is: ${VERSION}" - - 
name: Generate release - uses: ncipollo/release-action@v1 + - name: Checkout + uses: actions/checkout@v4 + - name: Run Docker Build + uses: ./.github/actions/docker with: - tag: ${{ env.VERSION }} - generateReleaseNotes: true - skipIfReleaseExists: true - # docker image tag latest + latest: false + dockerhub_username: ${{ secrets.DOCKERHUB_USERNAME }} + dockerhub_token: ${{ secrets.DOCKERHUB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/on_pr_open.yml b/.github/workflows/on_pr_open.yml new file mode 100644 index 0000000..6a776c3 --- /dev/null +++ b/.github/workflows/on_pr_open.yml @@ -0,0 +1,18 @@ +name: Test + +on: + pull_request: + branches: [ "main" ] + types: + - opened + - reopened + - synchronize + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Python Tests + uses: ./.github/actions/tests \ No newline at end of file diff --git a/.github/workflows/on_release.yml b/.github/workflows/on_release.yml new file mode 100644 index 0000000..59e11de --- /dev/null +++ b/.github/workflows/on_release.yml @@ -0,0 +1,92 @@ +--- +name: Create Official Release and Push Artifacts + +on: + push: + tags: + - v* + +permissions: + contents: write + +jobs: + tests: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Tests + uses: ./.github/actions/tests + create_release: + runs-on: ubuntu-latest + needs: tests + permissions: + contents: write + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Create Release + uses: ncipollo/release-action@v1 + with: + tag: ${{ github.ref_name }} + generateReleaseNotes: true + # build and push docker image + release-docker: + runs-on: ubuntu-latest + needs: + - tests + - create_release + environment: 'Dockerhub' + timeout-minutes: 10 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Run Docker Build + uses: ./.github/actions/docker + with: + dockerhub_username: ${{ secrets.DOCKERHUB_USERNAME }} + dockerhub_token: ${{ secrets.DOCKERHUB_TOKEN }} + release-python: + runs-on: ubuntu-latest + needs: + - tests + - create_release + timeout-minutes: 20 + environment: 'PyPi' + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Get tag release without v + shell: bash + run: | + TAG=${{ github.ref_name }} + echo "VERSION=${TAG#v}" >> "$GITHUB_ENV" + echo "Tag without v is: ${VERSION}" + - name: Update Release Tag + shell: bash + run: sed -i "s/^version = [^ ]*/version = ${{ env.VERSION }}/" setup.cfg + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.13.2' + - name: Install Dependencies + shell: bash + run: | + python -m pip install --upgrade pip + pip install build + - name: Build Python Package + shell: bash + run: | + python -m pip install --upgrade build + python -m build + - name: Publish to PyPi + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + skip-existing: true + # - name: Checkout + # uses: actions/checkout@v4 + # - name: Deploy release to PyPi + # uses: ./.github/actions/python + # with: + # pypi_api_token: "${{ secrets.PYPI_API_TOKEN }}" \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 707f8a5..695426b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,13 @@ ARG BASE_IMAGE=python -ARG BASE_IMAGE_TAG=3.12-slim-python +ARG BASE_IMAGE_TAG=3.13.2-slim-bookworm FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} +LABEL \ + org.opencontainers.image.title="bookstack-file-exporter" \ + org.opencontainers.image.description="Page 
asset and content exporter for Bookstack" \ + org.opencontainers.image.source="https://github.com/homeylab/bookstack-file-exporter" + # Get security updates and clean up apt cache for smaller size RUN apt update -y && apt upgrade -y && \ apt install dumb-init && \ @@ -11,9 +16,9 @@ RUN apt update -y && apt upgrade -y && \ # create docker user RUN useradd -M -s /usr/sbin/nologin -u 33333 exporter -ARG DOCKER_WORK_DIR -ARG DOCKER_CONFIG_DIR -ARG DOCKER_EXPORT_DIR +ARG DOCKER_WORK_DIR=/export +ARG DOCKER_CONFIG_DIR=/export/config +ARG DOCKER_EXPORT_DIR=/export/dump ENV DOCKER_CONFIG_DIR=${DOCKER_CONFIG_DIR} ENV DOCKER_EXPORT_DIR=${DOCKER_EXPORT_DIR} diff --git a/Makefile b/Makefile index 08edc58..7fd717b 100644 --- a/Makefile +++ b/Makefile @@ -1,10 +1,10 @@ ## DOCKER BUILD VARS BASE_IMAGE=python -BASE_IMAGE_TAG=3.12-slim-bookworm +BASE_IMAGE_TAG=3.13.2-slim-bookworm IMAGE_NAME=homeylab/bookstack-file-exporter # keep this start sequence unique (IMAGE_TAG=) # github actions will use this to create a tag -IMAGE_TAG=1.0.2 +IMAGE_TAG=main DOCKER_WORK_DIR=/export DOCKER_CONFIG_DIR=/export/config DOCKER_EXPORT_DIR=/export/dump @@ -15,10 +15,13 @@ pip_build: pip_local_dev: python -m pip install -e . -build: +pip_build: python -m pip install --upgrade build python -m build +lint: + pylint bookstack_file_exporter + upload_testpypi: python -m pip install --upgrade twine python -m twine upload --repository testpypi dist/* @@ -27,8 +30,24 @@ upload_testpypi: download_testpypi: python -m pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple bookstack-file-exporter +upload_realpypi: + python -m pip install --upgrade twine + python -m twine upload dist/* + +docker_build_simple: + docker build \ + --build-arg BASE_IMAGE=${BASE_IMAGE} \ + --build-arg BASE_IMAGE_TAG=${BASE_IMAGE_TAG} \ + --build-arg DOCKER_WORK_DIR=${DOCKER_WORK_DIR} \ + --build-arg DOCKER_CONFIG_DIR=${DOCKER_CONFIG_DIR} \ + --build-arg DOCKER_EXPORT_DIR=${DOCKER_EXPORT_DIR} \ + -t ${IMAGE_NAME}:${IMAGE_TAG} \ + --no-cache . + docker_build: docker buildx build \ + --platform linux/amd64,linux/arm64 \ + --output "type=image,push=false" \ --build-arg BASE_IMAGE=${BASE_IMAGE} \ --build-arg BASE_IMAGE_TAG=${BASE_IMAGE_TAG} \ --build-arg DOCKER_WORK_DIR=${DOCKER_WORK_DIR} \ @@ -39,6 +58,8 @@ docker_build: docker_build_latest: docker buildx build \ + --platform linux/amd64,linux/arm64 \ + --output "type=image,push=true" \ --build-arg BASE_IMAGE=${BASE_IMAGE} \ --build-arg BASE_IMAGE_TAG=${BASE_IMAGE_TAG} \ --build-arg DOCKER_WORK_DIR=${DOCKER_WORK_DIR} \ diff --git a/README.md b/README.md index ed5d612..5fa5732 100644 --- a/README.md +++ b/README.md @@ -6,21 +6,24 @@ Table of Contents - [Use Case](#use-case) - [Using This Application](#using-this-application) - [Run via Pip](#run-via-pip) - - [Run Via Docker](#run-via-docker) - - [Authentication](#authentication) + - [Run via Docker](#run-via-docker) + - [Run via Helm](#run-via-helm) + - [Authentication and Permissions](#authentication-and-permissions) - [Configuration](#configuration) - [Backup Behavior](#backup-behavior) - [General](#general) - [Images](#images) + - [Attachments](#attachments) - [Modify Markdown Files](#modify-markdown-files) - [Object Storage](#object-storage) - [Minio Backups](#minio-backups) + - [Potential Breaking Upgrades](#potential-breaking-upgrades) - [Future Items](#future-items) ## Background -_Features are actively being developed. See `Future Items` section for more details. 
Open an issue for a feature request._ +_If you encounter any issues, want to request an additional feature, or provide assistance, feel free to open a Github issue._ -This tool provides a way to export [Bookstack](https://github.com/BookStackApp/BookStack) pages and their content (_text, images, metadata, etc._) into a relational parent-child layout locally with an option to push to remote object storage locations. See [Backup Behavior](#backup-behavior) section for more details on how pages are organized. +This tool provides a way to export [Bookstack](https://github.com/BookStackApp/BookStack) pages and their content (_text, images, attachments, metadata, etc._) into a relational parent-child layout locally with an option to push to remote object storage locations. See [Backup Behavior](#backup-behavior) section for more details on how pages are organized. Image and attachment links can also be modified in markdown exports to point to local exported paths. This small project was mainly created to run as a cron job in k8s but works anywhere. This tool allows me to export my docs in markdown, or other formats like pdf. I use Bookstack's markdown editor as default instead of WYSIWYG editor and this makes my notes portable anywhere even if offline. @@ -29,13 +32,13 @@ What it does: - Discover and build relationships between Bookstack `Shelves/Books/Chapters/Pages` to create a relational parent-child layout - Export Bookstack pages and their content to a `.tgz` archive -- Additional content for pages like their images and metadata and can be exported -- The exporter can also [Modify Markdown Files](#modify-markdown-files) to replace image links with local exported image paths for a more portable backup +- Additional content for pages like their images, attachments, and metadata can also be exported +- The exporter can also [Modify Markdown Files](#modify-markdown-files) to replace image and/or attachment links with local exported paths for a more portable backup - YAML configuration file for repeatable and easy runs - Can be run via [Python](#run-via-pip) or [Docker](#run-via-docker) - Can push archives to remote object storage like [Minio](https://min.io/) - Basic housekeeping option (`keep_last`) to keep a tidy archive destination - +- Can run in application mode (always running) using the `run_interval` property. Used for basic scheduling of backups. Supported backup targets are: @@ -57,7 +60,7 @@ The main use case is to backup all docs in a relational directory-tree format to 2. Offline copy wanted. 3. Back up at a file level as an accessory or alternative to disk and volume backups. 4. Migrate all Bookstack page contents to Markdown documenting for simplicity. -5. Provide an easy way to do automated file backups locally, in docker, or kubernetes for Bookstack page contents. +5. Provide an easy way to do automated file backups locally, in docker, or [kubernetes](https://github.com/homeylab/helm-charts/tree/main/charts/bookstack#file-exporter-backup-your-pages) for Bookstack page contents. ## Using This Application Ensure a valid configuration is provided when running this application. See [Configuration](#Configuration) section for more details. 
@@ -67,8 +70,8 @@ Simple example configuration: # config.yml host: "https://bookstack.yourdomain.com" credentials: - token_id: "" - token_secret: "" + token_id: "" + token_secret: "" formats: # md only example - markdown # - html @@ -76,19 +79,25 @@ formats: # md only example # - plaintext output_path: "bkps/" assets: - export_images: false - modify_markdown: false - export_meta: false - verify_ssl: true + export_images: false + export_attachments: false + modify_markdown: false + export_meta: false ``` ### Run via Pip The exporter can be installed via pip and run directly. +#### Python Version +_Note: This application is tested and developed on Python version `3.13.2`. The min required version is >= `3.8` but is recommended to install (or set up a venv) a `3.13.2` version._ + #### Examples ```bash python -m pip install bookstack-file-exporter +# if you prefer a specific version, example: +python -m pip install bookstack-file-exporter==X.X.X + # using pip python -m bookstack_file_exporter -c @@ -114,11 +123,17 @@ export LOG_LEVEL=debug python -m bookstack_file_exporter -c ``` -#### Python Version -_Note: This application is tested and developed on Python version `3.12.X`. The min required version is >= `3.8` but is recommended to install (or set up a venv) a `3.12.X` version._ +### Run via Docker +Docker images are provided for `linux/amd64` and `linux/arm64` variants only at the moment. If another variant is required, please request it via Github Issue. -### Run Via Docker -Docker can be utilized to run the exporter. +#### Tags +Users will generally want to use the `latest` tag or a specific version tag. The `main` tag is also provided but is not guaranteed to be stable. + +| tag | description | +| --- | ----------- | +| `latest` | Latest stable release and is updated with each new stable release. | +| `X.X.X` | Semantic versioned releases are also provided if preferred for stability or other reasons. | +| `main` | This tag reflects the `main` branch of this repository and may not be stable | #### Examples ```bash @@ -137,6 +152,10 @@ docker run \ homeylab/bookstack-file-exporter:latest ``` +#### Docker Compose +When using the configuration option: `run_interval`, a docker compose set up could be used to run the exporter as an always running application. The exporter will sleep and wait until `{run_interval}` seconds has elapsed before subsequent runs. + +An example is shown in `examples/docker-compose.yaml` #### Environment Variables See [Valid Environment Variables](#valid-environment-variables) for more options. @@ -162,9 +181,19 @@ docker run \ | `config` | `/export/config/config.yml` | A valid configuration file |`-v /local/yourpath/config.yml:/export/config/config.yml:ro`| | `dump` | `/export/dump` | Directory to place exports. **This is optional when using remote storage option(s)**. Omit if you don't need a local copy. | `-v /local/yourpath/bkps:/export/dump` | -### Authentication -**Note visibility of pages is based on user**, so use a user that has access to pages you want to back up. +### Run via Helm +A helm chart can be used to run the exporter as a CronJob or Deployment resource. See [here](https://github.com/homeylab/helm-charts/tree/main/charts/bookstack-file-exporter) for more information on using the helm chart. +### Authentication and Permissions +#### Permissions +**Note visibility of pages is based on user**, so use a user that has read access to pages and content you want to back up. 
*The role assigned to the user* should have the additional permissions for target pages and their content: +- `read` for all images and attachments + - For most users this may already be set - may be required to be set depending on storage option used +- `Export Content` (This can be found in `Edit Role --> System Permissions`) + - For most users this may already be set - may be required to be set if using custom roles + - If not set, you may see page contents showing as an HTML login page, as reported in this [issue](https://github.com/homeylab/bookstack-file-exporter/issues/35) + +#### Token Authentication Ref: [https://demo.bookstackapp.com/api/docs#authentication](https://demo.bookstackapp.com/api/docs#authentication) Provide a tokenId and a tokenSecret as environment variables or directly in the configuration file. @@ -176,7 +205,9 @@ Env variables for credentials will take precedence over configuration file optio **For object storage authentication**, find the relevant sections further down in their respective sections. ### Configuration -_Ensure [Authentication](#authentication) has been set up beforehand for required credentials._ For a simple example to run quickly, refer to the one in the [Using This Application](#using-this-application) section. A full example is also shown below with descriptions. Optionally, look at `examples/` folder of the github repo for more examples. +_Ensure [Authentication](#authentication-and-permissions) has been set up beforehand for required credentials._ For a simple example to run quickly, refer to the one in the [Using This Application](#using-this-application) section. + +A full example is also shown below. Optionally, look at the `examples/` folder of the github repo for more examples with long descriptions. For object storage configuration, find more information in their respective sections - [Minio](#minio-backups) @@ -189,17 +220,21 @@ Below is an example configuration that shows example values for all possible opt ```yaml host: "https://bookstack.yourdomain.com" credentials: - token_id: "" - token_secret: "" -additional_headers: - test: "test" - test2: "test2" - User-Agent: "test-agent" + token_id: "" + token_secret: "" formats: - markdown - html - pdf - plaintext +http_config: + verify_ssl: false + timeout: 30 + backoff_factor: 2.5 + retry_codes: [413, 429, 500, 502, 503, 504] + retry_count: 5 + additional_headers: + User-Agent: "test-agent" minio: host: "minio.yourdomain.com" access_key: "" secret_key: "" @@ -211,10 +246,11 @@ minio: output_path: "bkps/" assets: export_images: true + export_attachments: true modify_markdown: false export_meta: false - verify_ssl: true keep_last: 5 +run_interval: 0 ``` #### Options and Descriptions @@ -222,24 +258,31 @@ More descriptions can be found for each section below: | Configuration Item | Type | Required | Description | | ------------------ | ---- | -------- | ----------- | -| `host` | `str` | `true` | If `http/https` not specified in the url, defaults to `https`. Use `assets.verify_ssl` to disable certificate checking. | +| `host` | `str` | `true` | If `http/https` not specified in the url, defaults to `https`. Use `http_config.verify_ssl` to disable certificate checking. | | `credentials` | `object` | `false` | Optional section where Bookstack tokenId and tokenSecret can be specified. Env variable for credentials may be supplied instead. See [Authentication](#authentication) for more details. 
| -| `credentials.token_id` | `str`| `true` if `credentials` | If `credentials` section is given, this should be a valid tokenId | -| `credentials.token_secret` | `str` | `true` if `credentials`| If `credentials` section is given, this should be a valid tokenSecret | -| `additional_headers` | `object` | `false` | Optional section where key/value for pairs can be specified to use in Bookstack http request headers. +| `credentials.token_id` | `str`| `false` if specified through env var instead, otherwise `true` | A valid Bookstack tokenId. | +| `credentials.token_secret` | `str` | `false` if specified through env var instead, otherwise `true` | A valid Bookstack tokenSecret. | | `formats` | `list` | `true` | Which export formats to use for Bookstack page content. Valid options are: `["markdown", "html", "pdf", "plaintext"]`| -| `output_path` | `str` | `false` | Optional (default: `cwd`) which directory (relative or full path) to place exports. User who runs the command should have access to read/write to this directory. If not provided, will use current run directory by default | +| `output_path` | `str` | `false` | Optional (default: `cwd`) which directory (relative or full path) to place exports. User who runs the command should have access to read/write to this directory. This directory and any parent directories will be attempted to be created if they do not exist. If not provided, will use current run directory by default. If using docker, this option can be omitted. | | `assets` | `object` | `false` | Optional section to export additional assets from pages. | | `assets.export_images` | `bool` | `false` | Optional (default: `false`), export all images for a page to an `image` directory within page directory. See [Backup Behavior](#backup-behavior) for more information on layout | -| `assets.modify_markdown` | `bool` | `false` | Optional (default: `false`), modify markdown files to replace image links with local exported image paths. This requires `assets.export_images` to be `true` in order to work. See [Modify Markdown Files](#modify-markdown-files) for more information. -| `assets.export_meta` | `bool` | `false` | Optional (default: `false`), export of metadata about the page in a json file | -| `assets.verify_ssl` | `bool` | `false` | Optional (default: `true`), whether or not to check ssl certificates when requesting content from Bookstack host | -| `keep_last` | `int` | `false` | Optional (default: `None`), if exporter can delete older archives. valid values are:
- set to `-1` if you want to delete all archives after each run (useful if you only want to upload to object storage)
- set to `1+` if you want to retain a certain number of archives
- `0` will result in no action done | +| `assets.export_attachments` | `bool` | `false` | Optional (default: `false`), export all attachments for a page to an `attachments` directory within page directory. See [Backup Behavior](#backup-behavior) for more information on layout | +| `assets.modify_markdown` | `bool` | `false` | Optional (default: `false`), modify markdown files to replace image links with local exported image paths. This requires `assets.export_images` to be `true` in order to work. See [Modify Markdown Files](#modify-markdown-files) for more information. | +| `assets.export_meta` | `bool` | `false` | Optional (default: `false`), export of metadata about the page in a json file. | +| `http_config` | `object` | `false` | Optional section to override default http configuration. | +| `http_config.verify_ssl` | `bool` | `false` | Optional (default: `false`), whether or not to verify ssl certificates if using https. | +| `http_config.timeout` | `int` | `false` | Optional (default: `30`), set the timeout, in seconds, for http requests. | +| `http_config.retry_count` | `int` | `false` | Optional (default: `5`), the number of http retries after initial failure. | +| `http_config.retry_codes` | `List[int]` | `false` | Optional (default: `[413, 429, 500, 502, 503, 504]`), which http response status codes trigger a retry. | +| `http_config.backoff_factor` | `float` | `false` | Optional (default: `2.5`), set the backoff_factor for http request retries. Default backoff_factor `2.5` means we wait 5, 10, 20, and then 40 seconds (with default `http_config.retry_count: 5`) before our last retry. This should allow for per minute rate limits to be refreshed. | +| `http_config.additional_headers` | `object` | `false` | Optional (default: `{}`), specify key/value pairs that will be added as additional headers to http requests. | +| `keep_last` | `int` | `false` | Optional (default: `0`), if exporter can delete older archives. valid values are:
- set to `-1` if you want to delete all archives after each run (useful if you only want to upload to object storage)
- set to `1+` if you want to retain a certain number of archives
- `0` will result in no action done. | +| `run_interval` | `int` | `false` | Optional (default: `0`). If specified, exporter will run as an application and pause for `{run_interval}` seconds before subsequent runs. Example: `86400` seconds = `24` hours or run once a day. Setting this property to `0` will invoke a single run and exit. Used for basic scheduling of backups. | | `minio` | `object` | `false` | Optional [Minio](#minio-backups) configuration options. | #### Valid Environment Variables General -- `LOG_LEVEL`: default: `info``. Provide a valid log level: info, debug, warning, error. +- `LOG_LEVEL`: default: `info`. Provide a valid log level: info, debug, warning, error. [Bookstack Credentials](#authentication) - `BOOKSTACK_TOKEN_ID` @@ -256,8 +299,12 @@ Backups are exported in `.tgz` format and generated based off timestamp. Export The exporter can also do housekeeping duties and keep a configured number of archives and delete older ones. See `keep_last` property in the [Configuration](#options-and-descriptions) section. Object storage provider configurations include their own `keep_last` property for flexibility. -For file names, `slug` names (from Bookstack API) are used, as such certain characters like `!`, `/` will be ignored and spaces replaced from page names/titles. +#### File Naming +For file names, `slug` names (from Bookstack API) are used, as such certain characters like `!`, `/` will be ignored and spaces replaced from page names/titles. If your page has an empty `slug` value for some reason (draft that was never fully saved), the exporter will use page name with the `slugify` function from Django to generate a valid slug. Example: `My Page.bin Name!` will be converted to `my-page-bin-name`. +You may also notice some directories (books) and/or files (pages) in the archive have a random string at the end, example - `nKA`: `user-and-group-management-nKA`. This is expected and is because there were resources with the same name created in another shelve and bookstack adds a string at the end to ensure uniqueness. + +#### Directory Layout All sub directories will be created as required during the export process. ``` Shelves --> Books --> Chapters --> Pages @@ -289,7 +336,7 @@ kafka-apps (shelf) ---> settings.md (page) ... -## Example with image layout +## Example with image and attachment layout # unassigned dir is used for books with no shelf unassigned (shelf) ---> test (book) @@ -300,12 +347,20 @@ unassigned (shelf) ---> rec-page ---> img-010.png ---> img-020.png + --> attachments (attachment_dir) + ---> test_page (page directory) + ---> something.config + ---> something_else.config + ---> rec-page + ---> test_output.log + ---> actual_output.log ---> test_page.md (page) ... 
---> rec_page (page) ---> rec_page.md ---> rec_page.pdf ``` + Another example is shown below: ``` ## First example: @@ -320,13 +375,18 @@ bookstack_export_2023-11-28_06-24-25/programming/react/images/basics/dwwimage.pn bookstack_export_2023-11-28_06-24-25/programming/react/images/basics/NzZimage.png bookstack_export_2023-11-28_06-24-25/programming/react/images/nextjs/next1.png bookstack_export_2023-11-28_06-24-25/programming/react/images/nextjs/tips.png +bookstack_export_2023-11-28_06-24-25/programming/react/attachments/nextjs/sample.config +bookstack_export_2023-11-28_06-24-25/programming/react/attachments/nextjs/sample_output.log bookstack_export_2023-11-28_06-24-25/programming/react/nextjs.md bookstack_export_2023-11-28_06-24-25/programming/react/nextjs.pdf ``` Books without a shelf will be put in a shelve folder named `unassigned`. -Empty/New Pages will be ignored since they have not been modified yet from creation and are empty but also do not have a valid slug. Example: +#### Empty/New Pages +Empty/New Pages will be ignored since they have not been modified yet from creation and are empty but also do not have a valid slug. + +Example from Bookstack API: ``` { ... @@ -336,10 +396,7 @@ Empty/New Pages will be ignored since they have not been modified yet from creat } ``` -You may notice some directories (books) and/or files (pages) in the archive have a random string at the end, example - `nKA`: `user-and-group-management-nKA`. This is expected and is because there were resources with the same name created in another shelve and bookstack adds a string at the end to ensure uniqueness. - ### Images - Images will be dumped in a separate directory, `images` within the page parent (book/chapter) directory it belongs to. The relative path will be `{parent}/images/{page}/{image_name}`. As shown earlier: ``` @@ -351,22 +408,41 @@ bookstack_export_2023-11-28_06-24-25/programming/react/images/nextjs/tips.png **Note you may see old images in your exports. This is because, by default, Bookstack retains images/drawings that are uploaded even if no longer referenced on an active page. Admins can run `Cleanup Images` in the Maintenance Settings or via [CLI](https://www.bookstackapp.com/docs/admin/commands/#cleanup-unused-images) to remove them.** +If an API call to get an image or its metadata fails, the exporter will skip the image and log the error. If using `modify_markdown` option, the image links in the document will be untouched and in its original form. All API calls are retried 3 times after initial failure. + +### Attachments +Attachments will be dumped in a separate directory, `attachments` within the page parent (book/chapter) directory it belongs to. The relative path will be `{parent}/attachments/{page}/{attachment_name}`. As shown earlier: + +``` +bookstack_export_2023-11-28_06-24-25/programming/react/attachments/nextjs/sample.config +bookstack_export_2023-11-28_06-24-25/programming/react/attachments/nextjs/sample_package.json +... +... +``` + +**Note attachments that are just external links are ignored. Only attachments that are shown as `external: False` will be exported.** + +[Reference](https://demo.bookstackapp.com/api/docs#attachments-list) and excerpt from Bookstack API docs: +> Get a listing of attachments visible to the user. The external property indicates whether the attachment is simple a link. A false value for the external property would indicate a file upload. 
+ +If an API call to get an attachment or its metadata fails, the exporter will skip the attachment and log the error. If using the `modify_markdown` option, the attachment links in the document will be untouched and in their original form. All API calls are retried 3 times after initial failure. + ### Modify Markdown Files -**To use this feature, `assets.export_images` should be set to `true`** +**To use this feature, `assets.export_images` and/or `assets.export_attachments` should be set to `true`** -The configuration item, `assets.modify_markdown`, can be set to `true` to modify markdown files to replace image url links with local exported image paths. This feature allows for you to make your `markdown` exports much more portable. +The configuration item, `assets.modify_markdown`, can be set to `true` to modify markdown files to replace image and attachment url links with local exported paths. This feature allows you to make your `markdown` exports much more portable. -Page (parent) -> Images (Children) relationships are created and then each image url is replaced with its own respective local export path. Example: +Page (parent) -> Images (Children) relationships are created and then each image/attachment url is replaced with its own respective local export path. Example: ``` ## before [![pool-topology-1.png](https://demo.bookstack/uploads/images/gallery/2023-07/scaled-1680-/pool-topology-1.png)](https://demo.bookstack/uploads/images/gallery/2023-07/pool-topology-1.png) ## after -[![pool-topology-1.png](./images/{page_name}/pool-topology-1.png)](https://demo.bookstack/uploads/images/gallery/2023-07/pool-topology-1.png) +[![pool-topology-1.png](images/{page_name}/pool-topology-1.png)](https://demo.bookstack/uploads/images/gallery/2023-07/pool-topology-1.png) ``` -This allows the image to be found locally within the export files and allow your `markdown` docs to have all the images display properly like it would normally would. +This allows the image or attachment to be found locally within the export files and allows your `markdown` docs to display all of their assets properly, as they normally would. -**Note: This will work properly if your pages are using the notation used by Bookstack for Markdown image links, example: ` [![image alt text](Bookstack Markdown image URL link)](anchor/url link)` The `(anchor/url link)` is optional.** +**Note: This will work properly if your pages are using the notation used by Bookstack for Markdown image links, example: ` [![image alt text](Bookstack Markdown image URL link)](anchor/url link)`. The `(anchor/url link)` is optional. For attachments, the format is: `[file](url link)`** ## Object Storage Optionally, target(s) can be specified to upload generated archives to a remote location. Supported object storage providers can be found below: @@ -402,13 +478,20 @@ minio: | `access_key` | `str` | `false` if specified through env var instead, otherwise `true` | Access key for the minio instance | | `secret_key` | `str` | `false` if specified through env var, otherwise `true` | Secret key for the minio instance | | `path` | `str` | `false` | Optional, path of the backup to use. Will use root bucket path if not set. `://bookstack-.tgz` | -| `keep_last` | `int` | `false` | Optional (default: `None`), if exporter can delete older archives in minio.
- set to `1+` if you want to retain a certain number of archives
- `0` will result in no action done | +| `keep_last` | `int` | `false` | Optional (default: `0`), if exporter can delete older archives in minio.
- set to `1+` if you want to retain a certain number of archives
- `0` will result in no action done | + +## Potential Breaking Upgrades +Below are versions that have major changes to the way configuration or exporter runs. + +| Start Version | Target Version | Description | +| ------------- | -------------- | ----------- | +| `< 1.4.X` | `1.5.0` | `assets.verify_ssl` has been moved to `http_config.verify_ssl` and the default value has been updated to `false`. `additional_headers` has been moved to `http_config.additional_headers` | ## Future Items 1. ~~Be able to pull images locally and place in their respective page folders for a more complete file level backup.~~ 2. ~~Include the exporter in a maintained helm chart as an optional deployment. The helm chart is [here](https://github.com/homeylab/helm-charts/tree/main/charts/bookstack).~~ 3. ~~Be able to modify markdown links of images to local exported images in their respective page folders for a more complete file level backup.~~ -4. Be able to pull attachments locally and place in their respective page folders for a more complete file level backup. +4. ~~Be able to pull attachments locally and place in their respective page folders for a more complete file level backup.~~ 5. Export S3 and more options. 6. Filter shelves and books by name - for more targeted backups. Example: you only want to share a book about one topic with an external friend/user. 7. Be able to pull media/photos from 3rd party providers like `drawio` \ No newline at end of file diff --git a/bookstack_file_exporter/__main__.py b/bookstack_file_exporter/__main__.py index d91a0df..38fd292 100644 --- a/bookstack_file_exporter/__main__.py +++ b/bookstack_file_exporter/__main__.py @@ -9,7 +9,7 @@ def main(): args: argparse.Namespace = run_args.get_args() logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', level=run_args.get_log_level(args.log_level), datefmt='%Y-%m-%d %H:%M:%S') - run.exporter(args) + run.entrypoint(args) if __name__ == '__main__': diff --git a/bookstack_file_exporter/archiver/archiver.py b/bookstack_file_exporter/archiver/archiver.py index c6d01c5..098b1a0 100644 --- a/bookstack_file_exporter/archiver/archiver.py +++ b/bookstack_file_exporter/archiver/archiver.py @@ -5,10 +5,11 @@ from bookstack_file_exporter.exporter.node import Node from bookstack_file_exporter.archiver import util -from bookstack_file_exporter.archiver.page_archiver import PageArchiver, ImageNode +from bookstack_file_exporter.archiver.page_archiver import PageArchiver from bookstack_file_exporter.archiver.minio_archiver import MinioArchiver from bookstack_file_exporter.config_helper.remote import StorageProviderConfig from bookstack_file_exporter.config_helper.config_helper import ConfigNode +from bookstack_file_exporter.common.util import HttpHelper log = logging.getLogger(__name__) @@ -17,56 +18,47 @@ # pylint: disable=too-many-instance-attributes class Archiver: """ - Archiver pulls all the necessary files from upstream + Archiver helps handle archive duties: pulls all the necessary files from upstream and then pushes them to the specified backup location(s) Args: :config: = Configuration with user inputs and general options. + :http_client: = http helper functions with config from user inputs Returns: Archiver instance with attributes that are accessible for use for handling bookstack exports and remote uploads. 
""" - def __init__(self, config: ConfigNode): + def __init__(self, config: ConfigNode, http_client: HttpHelper): self.config = config # for convenience self.base_dir = config.base_dir_name self.archive_dir = self._generate_root_folder(self.base_dir) - self._page_archiver = self._generate_page_archiver() + self._page_archiver = PageArchiver(self.archive_dir, self.config, http_client) self._remote_exports = {'minio': self._archive_minio, 's3': self._archive_s3} + def create_export_dir(self): + """create directory for archiving""" + if not self.config.user_inputs.output_path: + log.info("No output path specified, using current directory for archive") + return + log.info("Creating base directory for archive: %s", + self.config.user_inputs.output_path) + # in docker, this may fail if the user id is not the same as the host + try: + util.create_dir(self.config.user_inputs.output_path) + except PermissionError as perm_err: + log.warning("Failed to create base directory: %s", perm_err) + log.warning("This usually occurs in docker environments " \ + "attempting to skip this step") + return def get_bookstack_exports(self, page_nodes: Dict[int, Node]): """export all page content""" log.info("Exporting all bookstack page contents") # get images first if requested # this is because we may want to manipulate page data with modify_markdown flag - all_image_meta = self._get_page_image_map() - for _, page in page_nodes.items(): - page_image_meta = [] - if page.id_ in all_image_meta: - page_image_meta = all_image_meta[page.id_] - self._get_page_files(page, page_image_meta) - self._get_page_images(page, page_image_meta) - - def _get_page_files(self, page_node: Node, image_meta: List[ImageNode]): - """pull all bookstack pages into local files/tar""" - log.debug("Exporting bookstack page data") - self._page_archiver.archive_page(page_node, image_meta) - - def _get_page_image_map(self) -> Dict[int, ImageNode]: - if not self._page_archiver.export_images: - log.debug("skipping image export based on user input") - return {} - return self._page_archiver.get_image_meta() - - def _get_page_images(self, page_node: Node, img_nodes: List[ImageNode]): - if not img_nodes: - log.debug("page has no images to pull") - return - log.debug("Exporting bookstack page images") - self._page_archiver.archive_page_images(page_node.parent.file_path, - page_node.name, img_nodes) + self._page_archiver.archive_pages(page_nodes) def create_archive(self): """create tgz archive""" @@ -145,10 +137,6 @@ def _delete_files(self, file_list: List[str]): for file in file_list: util.remove_file(file) - def _generate_page_archiver(self)-> PageArchiver: - return PageArchiver(self.archive_dir, self.config) - - @staticmethod def _generate_root_folder(base_folder_name: str) -> str: """return base archive name""" diff --git a/bookstack_file_exporter/archiver/asset_archiver.py b/bookstack_file_exporter/archiver/asset_archiver.py new file mode 100644 index 0000000..c119e7f --- /dev/null +++ b/bookstack_file_exporter/archiver/asset_archiver.py @@ -0,0 +1,197 @@ +import logging +import base64 +from typing import Union, List, Dict +from re import sub as re_sub +# pylint: disable=import-error +from requests import Response + +from bookstack_file_exporter.common.util import HttpHelper + +log = logging.getLogger(__name__) + +_IMAGE_DIR_NAME = "images" +_ATTACHMENT_DIR_NAME = "attachments" + + +class AssetNode: + """ + Base class for other asset nodes. This class should not be used directly. 
+ + Args: + :meta_data: = asset meta data + + Returns: + AssetNode instance for use in other classes + """ + def __init__(self, meta_data: Dict[str, int | str | bool]): + self.id_: int = meta_data['id'] + self.page_id: int = meta_data['uploaded_to'] + self.url: str = "" + self.name: str = "" + self._markdown_str = "" + self._relative_path_prefix: str = "" + + def get_relative_path(self, page_name: str) -> str: + """image path local to page directory""" + return f"{self._relative_path_prefix}/{page_name}/{self.name}" + + @property + def markdown_str(self): + """return markdown url str to replace""" + return self._markdown_str + + def set_markdown_content(self, asset_data: Dict[str, int | str | bool]) -> None: + """set markdown url str to replace""" + self._markdown_str = self._get_md_url_str(asset_data) + + @staticmethod + def _get_md_url_str(asset_data: Dict[str, Union[int, str]]) -> str: + url_str = "" + if 'content' in asset_data: + if 'markdown' in asset_data['content']: + url_str = asset_data['content']['markdown'] + # check to see if empty before doing find + if not url_str: + return "" + # find the link between two parenthesis + # - markdown format + return url_str[url_str.find("(")+1:url_str.find(")")] + +class ImageNode(AssetNode): + """ + ImageNode handles image meta data and markdown url replacement. + + Args: + :meta_data: = image meta data + + Returns: + ImageNode instance for use in archiving images for a page + """ + def __init__(self, meta_data: Dict[str, Union[int, str]]): + super().__init__(meta_data) + self.url: str = meta_data['url'] + self.name: str = self.url.split('/')[-1] + log.debug("Image node has generated url: %s", self.url) + self._relative_path_prefix = f"{_IMAGE_DIR_NAME}" + +class AttachmentNode(AssetNode): + """ + AttachmentNode handles attachment meta data and markdown url replacement. + + Args: + :meta_data: = attachment meta data + :base_url: = base url for attachment download + + Returns: + AttachmentNode instance for use in archiving attachments for a page + """ + def __init__(self, meta_data: Dict[str, Union[int, str, bool]], + base_url: str): + super().__init__(meta_data) + self.url: str = f"{base_url}/{self.id_}" + self.name = meta_data['name'] + log.debug("Attachment node has generated url: %s", self.url) + self._relative_path_prefix = f"{_ATTACHMENT_DIR_NAME}" + + @staticmethod + def _get_md_url_str(asset_data: Dict[str, int | str | dict]) -> str: + url_str = "" + if 'links' in asset_data: + if 'markdown' in asset_data['links']: + url_str = asset_data['links']['markdown'] + # check to see if empty before doing find + if not url_str: + return "" + # find the link between two parenthesis + # - markdown format + return url_str[url_str.find("(")+1:url_str.find(")")] + +class AssetArchiver: + """ + AssetArchiver handles image and attachment exports for a page. 
+ + Args: + :urls: = api urls for images and attachments + :http_client: = http helper functions with config from user inputs + + Returns: + AssetArchiver instance for use in archiving images and attachments for a page + """ + def __init__(self, urls: Dict[str, str], http_client: HttpHelper): + self.api_urls = urls + self._asset_map = { + 'images': self._create_image_map, + 'attachments': self._create_attachment_map + } + self.http_client = http_client + + def get_asset_nodes(self, asset_type: str) -> Dict[str, ImageNode | AttachmentNode]: + """Get image or attachment helpers for a page""" + asset_response: Response = self.http_client.http_get_request( + self.api_urls[asset_type]) + asset_json = asset_response.json()['data'] + return self._asset_map[asset_type](asset_json) + + def get_asset_data(self, asset_type: str, + meta_data: Union[AttachmentNode, ImageNode]) -> Dict[str, str | bool | int | dict]: + """Get asset data based on type""" + data_url = f"{self.api_urls[asset_type]}/{meta_data.id_}" + asset_data_response: Response = self.http_client.http_get_request( + data_url) + return asset_data_response.json() + + def get_asset_bytes(self, asset_type: str, url: str) -> bytes: + """Get raw asset data""" + asset_response: Response = self.http_client.http_get_request( + url) + match asset_type: + case "images": + asset_data = asset_response.content + case "attachments": + asset_data = self._decode_attachment_data(asset_response.json()['content']) + return asset_data + + def update_asset_links(self, asset_type, page_name: str, page_data: bytes, + asset_nodes: List[ImageNode | AttachmentNode]) -> bytes: + """update markdown links in page data""" + for asset_node in asset_nodes: + # get metadata instead of raw data/bytes + asset_data = self.get_asset_data(asset_type, asset_node) + asset_node.set_markdown_content(asset_data) + if not asset_node.markdown_str: + continue + page_data = re_sub(asset_node.markdown_str.encode(), + asset_node.get_relative_path(page_name).encode(), page_data) + return page_data + + @staticmethod + def _create_image_map(json_data: Dict[str, + List[Dict[str, str | int | bool | dict]]]) -> Dict[int, List[ImageNode]]: + image_page_map = {} + for img_meta in json_data: + img_node = ImageNode(img_meta) + if img_node.page_id in image_page_map: + image_page_map[img_node.page_id].append(img_node) + else: + image_page_map[img_node.page_id] = [img_node] + return image_page_map + + def _create_attachment_map(self, + json_data: Dict[str, List[Dict[str, str | int | bool | dict]]]) -> List[AssetNode]: + asset_nodes = {} + for asset_meta in json_data: + asset_node = None + if asset_meta['external']: + continue # skip external link, only get attachments + asset_node = AttachmentNode(asset_meta, self.api_urls['attachments']) + if asset_node.page_id in asset_nodes: + asset_nodes[asset_node.page_id].append(asset_node) + else: + asset_nodes[asset_node.page_id] = [asset_node] + return asset_nodes + + @staticmethod + def _decode_attachment_data(b64encoded_data: str) -> bytes: + """decode base64 encoded data""" + asset_data = b64encoded_data.encode() + return base64.b64decode(asset_data) diff --git a/bookstack_file_exporter/archiver/page_archiver.py b/bookstack_file_exporter/archiver/page_archiver.py index 812da99..0b6b745 100644 --- a/bookstack_file_exporter/archiver/page_archiver.py +++ b/bookstack_file_exporter/archiver/page_archiver.py @@ -1,12 +1,14 @@ from typing import Union, List, Dict -import re +import logging # pylint: disable=import-error -from requests import Response - +from 
requests.exceptions import HTTPError from bookstack_file_exporter.exporter.node import Node from bookstack_file_exporter.archiver import util as archiver_util +from bookstack_file_exporter.archiver.asset_archiver import AssetArchiver, ImageNode, AttachmentNode from bookstack_file_exporter.config_helper.config_helper import ConfigNode -from bookstack_file_exporter.common import util as common_util +from bookstack_file_exporter.common.util import HttpHelper + +log = logging.getLogger(__name__) _META_FILE_SUFFIX = "_meta.json" _TAR_SUFFIX = ".tar" @@ -24,75 +26,24 @@ "tgz": _TAR_GZ_SUFFIX } -_IMAGE_DIR_NAME = "images" -_MARKDOWN_STR_CHECK = "markdown" - -class ImageNode: - """ - ImageNode provides metadata and convenience for Bookstack images. - - Args: - :img_meta_data: = image meta data - - Returns: - :ImageNode: instance with attributes to help handle images. - """ - def __init__(self, img_meta_data: Dict[str, Union[int, str]]): - self.id: int = img_meta_data['id'] - self.page_id: int = img_meta_data['uploaded_to'] - self.url: str = img_meta_data['url'] - self.name: str = self._get_image_name() - self._markdown_str = "" - self._relative_path_prefix: str = f"./{_IMAGE_DIR_NAME}" - - def _get_image_name(self) -> str: - return self.url.split('/')[-1] - - def get_image_relative_path(self, page_name: str) -> str: - """return image path local to page directory""" - return f"{self._relative_path_prefix}/{page_name}/{self.name}" - - def set_markdown_content(self, img_details: Dict[str, Union[int, str]]): - """provide image metadata to set markdown properties""" - self._markdown_str = self._get_md_url_str(img_details) - - @property - def markdown_str(self): - """return markdown url str to replace""" - return self._markdown_str - - @staticmethod - def _get_md_url_str(img_data: Dict[str, Union[int, str]]) -> str: - url_str = "" - if 'content' in img_data: - if _MARKDOWN_STR_CHECK in img_data['content']: - url_str = img_data['content'][_MARKDOWN_STR_CHECK] - # check to see if empty before doing find - if not url_str: - return "" - # find the link between two parenthesis - # - markdown format - return url_str[url_str.find("(")+1:url_str.find(")")] - # pylint: disable=too-many-instance-attributes class PageArchiver: """ PageArchiver handles all data extraction and modifications - to Bookstack page contents including images. + to Bookstack page contents including assets like images or attachments. Args: :archive_dir: = directory where data will be put into. - :config: = Configuration with user inputs and general options. + :http_client: = http helper functions with config from user inputs Returns: :PageArchiver: instance with methods to help collect page content from a Bookstack instance. 
""" - def __init__(self, archive_dir: str, config: ConfigNode) -> None: + def __init__(self, archive_dir: str, config: ConfigNode, http_client: HttpHelper) -> None: self.asset_config = config.user_inputs.assets self.export_formats = config.user_inputs.formats self.api_urls = config.urls - self._headers = config.headers # full path, bookstack-, and .tgz extension self.archive_file = f"{archive_dir}{_FILE_EXTENSION_MAP['tgz']}" # name of intermediate tar file before gzip @@ -100,59 +51,114 @@ def __init__(self, archive_dir: str, config: ConfigNode) -> None: # name of the base folder to use within the tgz archive (internal tar layout) self.archive_base_path = archive_dir.split("/")[-1] self.modify_md: bool = self._check_md_modify() + self.asset_archiver = AssetArchiver(self.api_urls, + http_client) + self.http_client = http_client def _check_md_modify(self) -> bool: # check to ensure they have asset_config defined, could be None - if _MARKDOWN_STR_CHECK in self.export_formats: - return self.asset_config.modify_markdown and self.export_images + if 'markdown' in self.export_formats: + return self.asset_config.modify_markdown and \ + ( self.export_images or self.export_attachments) return False - def archive_page(self, page: Node, - image_urls: List[str] = None): - """export page content""" - for export_format in self.export_formats: - page_data = self._get_page_data(page.id_, export_format) - self._archive_page(page, export_format, - page_data, image_urls) - if self.asset_config.export_meta: - self._archive_page_meta(page.file_path, page.meta) - - def _archive_page(self, page: Node, export_format: str, data: bytes, - image_nodes: List[ImageNode] = None): + def archive_pages(self, page_nodes: Dict[int, Node]): + """export page contents and their images/attachments""" + # get assets first if requested + # this is because we may want to manipulate page data with modify_markdown flag + image_nodes = self._get_image_meta() + attachment_nodes = self._get_attachment_meta() + for _, page in page_nodes.items(): + page_images = [] + page_attachments = [] + if page.id_ in image_nodes: + page_images = image_nodes[page.id_] + if page.id_ in attachment_nodes: + page_attachments = attachment_nodes[page.id_] + failed_images = self.archive_page_assets("images", page.parent.file_path, + page.name, page_images) + failed_attach = self.archive_page_assets("attachments", page.parent.file_path, + page.name, page_attachments) + # exclude from page_images + # so it doesn't attempt to get modified in markdown file + if failed_images: + page_images = [img for img in page_images if img.id_ not in failed_images] + # exclude from page_attachments + # so it doesn't attempt to get modified in markdown file + if failed_attach: + page_attachments = [attach for attach in page_attachments + if attach.id_ not in failed_attach] + for export_format in self.export_formats: + page_data = self._get_page_data(page.id_, export_format) + if page_images and export_format == 'markdown': + page_data = self._modify_markdown("images", page.name, + page_data, page_images) + if page_attachments and export_format == 'markdown': + page_data = self._modify_markdown("attachments", page.name, + page_data, page_attachments) + self._archive_page(page, export_format, + page_data) + if self.asset_config.export_meta: + self._archive_page_meta(page.file_path, page.meta) + + def _archive_page(self, page: Node, export_format: str, data: bytes): page_file_name = f"{self.archive_base_path}/" \ f"{page.file_path}{_FILE_EXTENSION_MAP[export_format]}" - if 
self.modify_md and export_format == _MARKDOWN_STR_CHECK and image_nodes: - data = self._update_image_links(page.name, data, image_nodes) self.write_data(page_file_name, data) - def _get_page_data(self, page_id: int, export_format: str): + def _get_page_data(self, page_id: int, export_format: str) -> bytes: url = f"{self.api_urls['pages']}/{page_id}/{_EXPORT_API_PATH}/{export_format}" - return archiver_util.get_byte_response(url=url, headers=self._headers, - verify_ssl=self.verify_ssl) + return archiver_util.get_byte_response(url=url, + http_client=self.http_client) def _archive_page_meta(self, page_path: str, meta_data: Dict[str, Union[str, int]]): meta_file_name = f"{self.archive_base_path}/{page_path}{_FILE_EXTENSION_MAP['meta']}" bytes_meta = archiver_util.get_json_bytes(meta_data) self.write_data(file_path=meta_file_name, data=bytes_meta) - def get_image_meta(self) -> Dict[int, List[ImageNode]]: + def _get_image_meta(self) -> Dict[int, List[ImageNode]]: """Get all image metadata into a {page_number: [image_url]} format""" - img_meta_response: Response = common_util.http_get_request( - self.api_urls['images'], - self._headers, - self.verify_ssl) - img_meta_json = img_meta_response.json()['data'] - return self._create_image_map(img_meta_json) - - def archive_page_images(self, parent_path: str, page_name: str, - image_nodes: List[ImageNode]): + if not self.asset_config.export_images: + return {} + return self.asset_archiver.get_asset_nodes('images') + + def _get_attachment_meta(self) -> Dict[int, List[AttachmentNode]]: + """Get all attachment metadata into a {page_number: [attachment_url]} format""" + if not self.asset_config.export_attachments: + return {} + return self.asset_archiver.get_asset_nodes('attachments') + + def _modify_markdown(self, asset_type: str, + page_name: str, page_data: bytes, + asset_nodes: List[ImageNode | AttachmentNode]) -> bytes: + if not self.modify_md: + return page_data + return self.asset_archiver.update_asset_links(asset_type, page_name, page_data, + asset_nodes) + + def archive_page_assets(self, asset_type: str, parent_path: str, page_name: str, + asset_nodes: List[ImageNode | AttachmentNode]) -> Dict[int, int]: """pull images locally into a directory based on page""" - image_base_path = f"{self.archive_base_path}/{parent_path}/{_IMAGE_DIR_NAME}" - for img_node in image_nodes: - img_data: bytes = archiver_util.get_byte_response(img_node.url, self._headers, - self.verify_ssl) - image_path = f"{image_base_path}/{page_name}/{img_node.name}" - self.write_data(image_path, img_data) + if not asset_nodes: + return {} + # use a map for faster lookup + failed_assets = {} + node_base_path = f"{self.archive_base_path}/{parent_path}" + for asset_node in asset_nodes: + try: + asset_data = self.asset_archiver.get_asset_bytes(asset_type, asset_node.url) + except HTTPError: + # probably unnecessary, but just in case + if asset_node.id_ not in failed_assets: + failed_assets[asset_node.id_] = 0 + # a 404 or other error occurred + # skip this asset + log.error("Failed to get image or attachment data " \ + "for asset located at: %s - skipping", asset_node.url) + continue + asset_path = f"{node_base_path}/{asset_node.get_relative_path(page_name)}" + self.write_data(asset_path, asset_data) + return failed_assets def write_data(self, file_path: str, data: bytes): """write data to a tar file @@ -167,21 +173,6 @@ def gzip_archive(self): """provide the tar to gzip and the name of the gzip output file""" archiver_util.create_gzip(self.tar_file, self.archive_file) - def 
_update_image_links(self, page_name: str, page_data: bytes, - image_nodes: List[ImageNode]) -> bytes: - """regex replace links to local created directories""" - for img_node in image_nodes: - img_meta_url = f"{self.api_urls['images']}/{img_node.id}" - img_details = common_util.http_get_request(img_meta_url, - self._headers, self.verify_ssl) - img_node.set_markdown_content(img_details.json()) - if not img_node.markdown_str: - continue - # 1 - what to replace, 2 - replace with, 3 is the data to replace - page_data = re.sub(img_node.markdown_str.encode(), - img_node.get_image_relative_path(page_name).encode(), page_data) - return page_data - @property def file_extension_map(self) -> Dict[str, str]: """file extension metadata""" @@ -192,23 +183,12 @@ def export_images(self) -> bool: """return whether or not to export images""" return self.asset_config.export_images + @property + def export_attachments(self) -> bool: + """return whether or not to export attachments""" + return self.asset_config.export_attachments + @property def verify_ssl(self) -> bool: """return whether or not to verify ssl for http requests""" return self.asset_config.verify_ssl - - # @staticmethod - # def _get_regex_expr(url: str) -> bytes: - # # regex_str = fr"\[\!\[^$|.*\]\({url}\)\]" - # return re.compile(regex_str.encode()) - - @staticmethod - def _create_image_map(json_data: List[Dict[str, Union[str,int]]]) -> Dict[int, List[ImageNode]]: - image_page_map = {} - for img_meta in json_data: - img_node = ImageNode(img_meta) - if img_node.page_id in image_page_map: - image_page_map[img_node.page_id].append(img_node) - else: - image_page_map[img_node.page_id] = [img_node] - return image_page_map diff --git a/bookstack_file_exporter/archiver/util.py b/bookstack_file_exporter/archiver/util.py index 0980fb6..21cf8af 100644 --- a/bookstack_file_exporter/archiver/util.py +++ b/bookstack_file_exporter/archiver/util.py @@ -7,14 +7,15 @@ from io import BytesIO import gzip import glob +from pathlib import Path -from bookstack_file_exporter.common import util +from bookstack_file_exporter.common.util import HttpHelper log = logging.getLogger(__name__) -def get_byte_response(url: str, headers: Dict[str, str], verify_ssl: bool) -> bytes: +def get_byte_response(url: str, http_client: HttpHelper) -> bytes: """get byte response from http request""" - response = util.http_get_request(url=url, headers=headers, verify_ssl=verify_ssl) + response = http_client.http_get_request(url=url) return response.content # append to a tar file instead of creating files locally and then tar'ing after @@ -48,3 +49,7 @@ def scan_archives(base_dir: str, extension: str) -> str: """scan export directory for archives""" file_pattern = f"{base_dir}_*{extension}" return glob.glob(file_pattern) + +def create_dir(dir_path: str): + """create a directory if not exists""" + Path(dir_path).mkdir(parents=True, exist_ok=True) diff --git a/bookstack_file_exporter/common/util.py b/bookstack_file_exporter/common/util.py index 4591e9d..02cd807 100644 --- a/bookstack_file_exporter/common/util.py +++ b/bookstack_file_exporter/common/util.py @@ -1,44 +1,72 @@ import logging from typing import Dict +import urllib3 # pylint: disable=import-error import requests # pylint: disable=import-error from requests.adapters import HTTPAdapter, Retry +from bookstack_file_exporter.config_helper.models import HttpConfig + log = logging.getLogger(__name__) -def http_get_request(url: str, headers: Dict[str, str], - verify_ssl: bool, timeout: int = 30) -> requests.Response: - """make http requests 
and return response object""" - url_prefix = should_verify(url) - try: - with requests.Session() as session: - # {backoff factor} * (2 ** ({number of previous retries})) - # {raise_on_status} if status falls in status_forcelist range - # and retries have been exhausted. - # {status_force_list} 429 is supposed to be included - retries = Retry(total=3, - backoff_factor=0.5, - raise_on_status=True, - status_forcelist=[ 500, 502, 503, 504 ]) - session.mount(url_prefix, HTTPAdapter(max_retries=retries)) - response = session.get(url, headers=headers, verify=verify_ssl, timeout=timeout) - except Exception as req_err: - log.error("Failed to make request for %s", url) - raise req_err - try: - #raise_for_status() throws an exception on codes 400-599 - response.raise_for_status() - except requests.exceptions.HTTPError as e: - # this means it either exceeded 50X retries in `http_get_request` handler - # or it returned a 40X which is not expected - log.error("Bookstack request failed with status code: %d on url: %s", - response.status_code, url) - raise e - return response - -def should_verify(url: str) -> str: - """check if http or https""" - if url.startswith("https"): - return "https://" - return "http://" +# disable TLS warnings if using verify_ssl=false +urllib3.disable_warnings() + +class HttpHelper: + """ + HttpHelper provides an http request helper with config stored and retries built in + + Args: + :headers: = all headers to use for http requests + :config: = Configuration with user inputs for http requests + + Returns: + :HttpHelper: instance with methods to help with http requests. + """ + def __init__(self, headers: Dict[str, str], + config: HttpConfig): + self.backoff_factor = config.backoff_factor + self.retry_codes = config.retry_codes + self.retry_count = config.retry_count + self.http_timeout = config.timeout + self.verify_ssl = config.verify_ssl + self._headers = headers + + # more details on options: https://urllib3.readthedocs.io/en/stable/reference/urllib3.util.html + def http_get_request(self, url: str) -> requests.Response: + """make http requests and return response object""" + url_prefix = self.should_verify(url) + try: + with requests.Session() as session: + # {backoff factor} * (2 ** ({number of previous retries})) + # {raise_on_status} if status falls in status_forcelist range + # and retries have been exhausted. 
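The Retry settings above translate into a predictable sleep schedule. A minimal sketch of the documented formula (backoff_factor * 2 ** (retry_number - 1)); whether the very first retry sleeps at all depends on the installed urllib3 version, so treat the output as nominal. The numbers plugged in match the HttpConfig defaults introduced later in this change.

    # nominal backoff schedule for backoff_factor=2.5, retry_count=5 (illustrative inputs)
    backoff_factor = 2.5
    retry_count = 5
    for retry_number in range(1, retry_count + 1):
        print(f"retry {retry_number}: sleep ~{backoff_factor * (2 ** (retry_number - 1))}s")
    # -> 2.5s, 5.0s, 10.0s, 20.0s, 40.0s
    # (older urllib3 releases skip the first sleep, which lines up with the
    #  "5, 10, 20, and then 40 seconds" wording in the example configs)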
+ # {status_force_list} 413, 429, 503 defaults are overwritten with additional ones + retries = Retry(total=self.retry_count, + backoff_factor=self.backoff_factor, + raise_on_status=True, + status_forcelist=self.retry_codes) + session.mount(url_prefix, HTTPAdapter(max_retries=retries)) + response = session.get(url, headers=self._headers, verify=self.verify_ssl, + timeout=self.http_timeout) + except Exception as req_err: + log.error("Failed to make request for %s", url) + raise req_err + try: + #raise_for_status() throws an exception on codes 400-599 + response.raise_for_status() + except requests.exceptions.HTTPError as e: + # this means it either exceeded 50X retries in `http_get_request` handler + # or it returned a 40X which is not expected + log.error("Bookstack request failed with status code: %d on url: %s", + response.status_code, url) + raise e + return response + + @staticmethod + def should_verify(url: str) -> str: + """check if http or https""" + if url.startswith("https"): + return "https://" + return "http://" diff --git a/bookstack_file_exporter/config_helper/config_helper.py b/bookstack_file_exporter/config_helper/config_helper.py index 46911b2..a1c3fe0 100644 --- a/bookstack_file_exporter/config_helper/config_helper.py +++ b/bookstack_file_exporter/config_helper/config_helper.py @@ -19,7 +19,8 @@ "books": "api/books", "chapters": "api/chapters", "pages": "api/pages", - "images": "api/image-gallery" + "images": "api/image-gallery", + "attachments": "api/attachments" } _UNASSIGNED_BOOKS_DIR = "unassigned/" @@ -56,8 +57,6 @@ def __init__(self, args: argparse.Namespace): self._token_id, self._token_secret = self._generate_credentials() self._headers = self._generate_headers() self._urls = self._generate_urls() - self._minio_access_key = "" - self._minio_secret_key = "" self._object_storage_config = self._generate_remote_config() def _generate_config(self, config_file: str) -> models.UserInput: @@ -80,11 +79,8 @@ def _generate_config(self, config_file: str) -> models.UserInput: def _generate_credentials(self) -> Tuple[str, str]: # if user provided credentials in config file, load them - token_id = "" - token_secret = "" - if self.user_inputs.credentials: - token_id = self.user_inputs.credentials.token_id - token_secret = self.user_inputs.credentials.token_secret + token_id = self.user_inputs.credentials.token_id + token_secret = self.user_inputs.credentials.token_secret # check to see if env var is specified, if so, it takes precedence token_id = self._check_var(_BOOKSTACK_TOKEN_FIELD, token_id) @@ -99,15 +95,20 @@ def _generate_remote_config(self) -> Dict[str, StorageProviderConfig]: self.user_inputs.minio.access_key) minio_secret_key = self._check_var(_MINIO_SECRET_KEY_FIELD, self.user_inputs.minio.secret_key) + object_config["minio"] = StorageProviderConfig(minio_access_key, minio_secret_key, self.user_inputs.minio) + for platform, config in object_config.items(): + if not config.is_valid(platform): + error_str = "provided " + platform + " configuration is invalid" + raise ValueError(error_str) return object_config def _generate_headers(self) -> Dict[str, str]: headers = {} # add additional_headers provided by user - if self.user_inputs.additional_headers: - for key, value in self.user_inputs.additional_headers.items(): + if self.user_inputs.http_config.additional_headers: + for key, value in self.user_inputs.http_config.additional_headers.items(): headers[key] = value # add default headers diff --git a/bookstack_file_exporter/config_helper/models.py 
b/bookstack_file_exporter/config_helper/models.py index 8b37b3d..2e89649 100644 --- a/bookstack_file_exporter/config_helper/models.py +++ b/bookstack_file_exporter/config_helper/models.py @@ -5,37 +5,46 @@ # pylint: disable=too-few-public-methods class ObjectStorageConfig(BaseModel): """YAML schema for minio configuration""" - host: str - access_key: Optional[str] = None - secret_key: Optional[str] = None + host: Optional[str] = "" + access_key: Optional[str] = "" + secret_key: Optional[str] = "" bucket: str - path: Optional[str] = None + path: Optional[str] = "" region: str - keep_last: Optional[int] = None + keep_last: Optional[int] = 0 # pylint: disable=too-few-public-methods class BookstackAccess(BaseModel): """YAML schema for bookstack access credentials""" - token_id: str - token_secret: str + token_id: Optional[str] = "" + token_secret: Optional[str] = "" # pylint: disable=too-few-public-methods class Assets(BaseModel): """YAML schema for bookstack markdown asset(pages/images/attachments) configuration""" export_images: Optional[bool] = False + export_attachments: Optional[bool] = False modify_markdown: Optional[bool] = False export_meta: Optional[bool] = False - verify_ssl: Optional[bool] = True + +class HttpConfig(BaseModel): + """YAML schema for user provided http settings""" + verify_ssl: Optional[bool] = False + timeout: Optional[int] = 30 + backoff_factor: Optional[float] = 2.5 + retry_codes: Optional[List[int]] = [413, 429, 500, 502, 503, 504] + retry_count: Optional[int] = 5 + additional_headers: Optional[Dict[str, str]] = {} # pylint: disable=too-few-public-methods class UserInput(BaseModel): """YAML schema for user provided configuration file""" host: str - additional_headers: Optional[Dict[str, str]] = None - credentials: Optional[BookstackAccess] = None + credentials: Optional[BookstackAccess] = BookstackAccess() formats: List[Literal["markdown", "html", "pdf", "plaintext"]] - output_path: Optional[str] = None - # export_meta: Optional[bool] = None + output_path: Optional[str] = "" assets: Optional[Assets] = Assets() minio: Optional[ObjectStorageConfig] = None - keep_last: Optional[int] = None + keep_last: Optional[int] = 0 + run_interval: Optional[int] = 0 + http_config: Optional[HttpConfig] = HttpConfig() diff --git a/bookstack_file_exporter/config_helper/remote.py b/bookstack_file_exporter/config_helper/remote.py index d97dd04..ddcc717 100644 --- a/bookstack_file_exporter/config_helper/remote.py +++ b/bookstack_file_exporter/config_helper/remote.py @@ -1,5 +1,9 @@ +import logging + from bookstack_file_exporter.config_helper.models import ObjectStorageConfig +log = logging.getLogger(__name__) + ## convenience class ## able to work for minio, s3, etc. 
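Because every new field in models.py above carries a default, a nearly empty YAML file still resolves to a fully populated settings object. A minimal sketch, assuming the import path used in this repository; the host and format values are placeholders.

    # sketch: defaults filled in by the pydantic models when sections are omitted
    from bookstack_file_exporter.config_helper.models import UserInput

    user = UserInput(host="https://bookstack.example.org", formats=["markdown"])
    print(user.http_config.backoff_factor, user.http_config.retry_count)  # -> 2.5 5
    print(user.assets.export_images, user.assets.export_attachments)      # -> False False
    print(user.credentials.token_id == "")  # -> True (expected to come from env vars instead)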
class StorageProviderConfig: @@ -21,6 +25,7 @@ def __init__(self, access_key: str, secret_key: str, config: ObjectStorageConfig self.config = config self._access_key = access_key self._secret_key = secret_key + self._valid_checker = {'minio': self._is_minio_valid()} @property def access_key(self) -> str: @@ -31,3 +36,19 @@ def access_key(self) -> str: def secret_key(self) -> str: """return secret key for use""" return self._secret_key + + def is_valid(self, storage_type: str) -> bool: + """check if object storage config is valid""" + return self._valid_checker[storage_type] + + def _is_minio_valid(self) -> bool: + """check if minio config is valid""" + # required values - keys and bucket already checked so skip + checks = { + "host": self.config.host + } + for prop, check in checks.items(): + if not check: + log.error("%s is missing from minio configuration and is required", prop) + return False + return True diff --git a/bookstack_file_exporter/exporter/exporter.py b/bookstack_file_exporter/exporter/exporter.py index 910c30c..fd1b3ed 100644 --- a/bookstack_file_exporter/exporter/exporter.py +++ b/bookstack_file_exporter/exporter/exporter.py @@ -5,7 +5,7 @@ from requests import Response from bookstack_file_exporter.exporter.node import Node -from bookstack_file_exporter.common import util +from bookstack_file_exporter.common.util import HttpHelper log = logging.getLogger(__name__) @@ -19,10 +19,9 @@ class NodeExporter(): Returns: NodeExporter instance to handle building shelve/book/chapter/page relations. """ - def __init__(self, api_urls: Dict[str, str], headers: Dict[str,str], verify_ssl: bool): + def __init__(self, api_urls: Dict[str, str], http_client: HttpHelper): self.api_urls = api_urls - self.headers = headers - self.verify_ssl = verify_ssl + self.http_client = http_client def get_all_shelves(self) -> Dict[int, Node]: """ @@ -38,8 +37,7 @@ def get_all_shelves(self) -> Dict[int, Node]: def _get_json_response(self, url: str) -> List[Dict[str, Union[str,int]]]: """get http response data in json format""" - response: Response = util.http_get_request(url=url, headers=self.headers, - verify_ssl=self.verify_ssl) + response: Response = self.http_client.http_get_request(url=url) return response.json() def _get_all_ids(self, url: str) -> List[int]: @@ -156,7 +154,8 @@ def get_all_pages(self, book_nodes: Dict[int, Node]) -> Dict[int, Node]: # add `page` flag, we only want pages # filter out chapters for now # chapters can have their own children/pages - page_nodes: Dict[int, Node] = self.get_child_nodes("pages", book_nodes, node_type="page") + page_nodes: Dict[int, Node] = self.get_child_nodes("pages", + book_nodes, node_type="page") ## chapters (if exists) # chapter nodes are treated a little differently # chapters are children under books diff --git a/bookstack_file_exporter/exporter/node.py b/bookstack_file_exporter/exporter/node.py index d63e17f..5abe41d 100644 --- a/bookstack_file_exporter/exporter/node.py +++ b/bookstack_file_exporter/exporter/node.py @@ -1,4 +1,6 @@ from typing import Dict, Union, List +import unicodedata +from re import sub as re_sub # shelves --> 'books' # books --> 'content' @@ -34,7 +36,9 @@ def __init__(self, meta: Dict[str, Union[str, int]], self._parent = parent self._path_prefix = path_prefix # for convenience/usage for exporter - self.name: str = self.meta['slug'] + # self.name: str = self.meta['slug'] + self.name = self.get_name(self.meta['slug'], self.meta['name']) + # id() is a built-in function and should not be used as a variable name self.id_: int = 
self.meta['id'] self._display_name = self.meta['name'] # children @@ -42,6 +46,14 @@ def __init__(self, meta: Dict[str, Union[str, int]], # if parent self._file_path = self._get_file_path() + def get_name(self, slug: str, name: str) -> str: + """return name of resource""" + if slug: + return slug + if name != _NULL_PAGE_NAME: + return self.slugify(name) + return "" + def _get_file_path(self) -> str: if self._parent: # page node @@ -86,3 +98,23 @@ def empty(self): if not self.name and self._display_name == _NULL_PAGE_NAME: return True return False + + @staticmethod + def slugify(value: str, allow_unicode=False): + """ + Taken from https://github.com/django/django/blob/master/django/utils/text.py + Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated + dashes to single dashes. Remove characters that aren't alphanumerics, + underscores, or hyphens. Convert to lowercase. Also strip leading and + trailing whitespace, dashes, and underscores. + """ + if allow_unicode: + value = unicodedata.normalize("NFKC", value) + else: + value = ( + unicodedata.normalize("NFKD", value) + .encode("ascii", "ignore") + .decode("ascii") + ) + value = re_sub(r"[^\w\s-]", "", value.lower()) + return re_sub(r"[-\s]+", "-", value).strip("-_") diff --git a/bookstack_file_exporter/run.py b/bookstack_file_exporter/run.py index 2ad787c..e7b408e 100644 --- a/bookstack_file_exporter/run.py +++ b/bookstack_file_exporter/run.py @@ -1,37 +1,47 @@ import argparse import sys import logging +import time from typing import Dict from bookstack_file_exporter.config_helper.config_helper import ConfigNode from bookstack_file_exporter.exporter.node import Node from bookstack_file_exporter.exporter.exporter import NodeExporter from bookstack_file_exporter.archiver.archiver import Archiver +from bookstack_file_exporter.common.util import HttpHelper log = logging.getLogger(__name__) -def exporter(args: argparse.Namespace): - """export bookstack nodes and archive locally and/or remotely""" - ## get configuration from helper +def entrypoint(args: argparse.Namespace): + """entrypoint for export process""" + # get configuration from helper config = ConfigNode(args) + if config.user_inputs.run_interval: + while True: + exporter(config) + log.info("Waiting %s seconds for next run", config.user_inputs.run_interval) + # sleep process state + time.sleep(config.user_inputs.run_interval) + exporter(config) - ## convenience vars - bookstack_headers = config.headers - api_urls = config.urls - unassigned_dir = config.unassigned_book_dir - verify_ssl = config.user_inputs.assets.verify_ssl +def exporter(config: ConfigNode): + """export bookstack nodes and archive locally and/or remotely""" #### Export Data ##### # need to implement pagination for apis log.info("Beginning run") + ## Helper functions with user provided (or defaults) http config + http_client = HttpHelper(config.headers, config.user_inputs.http_config) + ## Use exporter class to get all the resources (pages, books, etc.) 
and their relationships log.info("Building shelve/book/chapter/page relationships") - export_helper = NodeExporter(api_urls, bookstack_headers, verify_ssl) + export_helper = NodeExporter(config.urls, http_client) ## shelves shelve_nodes: Dict[int, Node] = export_helper.get_all_shelves() ## books - book_nodes: Dict[int, Node] = export_helper.get_all_books(shelve_nodes, unassigned_dir) + book_nodes: Dict[int, Node] = export_helper.get_all_books(shelve_nodes, + config.unassigned_book_dir) ## pages page_nodes: Dict[int, Node] = export_helper.get_all_pages(book_nodes) if not page_nodes: @@ -39,7 +49,10 @@ def exporter(args: argparse.Namespace): sys.exit(0) log.info("Beginning archive") ## start archive ## - archive: Archiver = Archiver(config) + archive: Archiver = Archiver(config, http_client) + + # create export directory if not exists + archive.create_export_dir() # get all page content for each page archive.get_bookstack_exports(page_nodes) @@ -53,4 +66,5 @@ def exporter(args: argparse.Namespace): # clean up the .tgz archive since it is already uploaded archive.clean_up() + log.info("Created file archive: %s.tgz", archive.archive_dir) log.info("Completed run") diff --git a/examples/config.yml b/examples/config.yml index 2a12cb1..99481d4 100644 --- a/examples/config.yml +++ b/examples/config.yml @@ -3,20 +3,14 @@ # if you put http here, it will try verify=false, not to check certs host: "https://bookstack.mydomain.org" # You could optionally set the bookstack token_id and token_secret here instead of env -# If using env vars instead you can omit/comment out this section +# If using env vars instead you can leave values empty or omit this section credentials: - # set here or as env variable, BOOKSTACK_TOKEN_ID - # env var takes precedence over below - token_id: "" - # set here or as env variable, BOOKSTACK_TOKEN_SECRET - # env var takes precedence over below - token_secret: "" -# optional - additional headers to add, examples below -# if not required, you can omit/comment out section -additional_headers: - test: "test" - test2: "test2" - User-Agent: "test-agent" + # set here or as env variable, BOOKSTACK_TOKEN_ID + # env var takes precedence over below + token_id: "" + # set here or as env variable, BOOKSTACK_TOKEN_SECRET + # env var takes precedence over below + token_secret: "" # supported formats from bookstack below # specify one or more formats: @@ -29,6 +23,9 @@ assets: # optional export of all the images used in a page(s). # omit this or set to false if not needed export_images: false + # optional export of all the attachments used in a page(s). + # omit this or set to false if not needed + export_attachments: false # optional modify markdown files to replace image url links # with local exported image paths modify_markdown: false @@ -37,8 +34,28 @@ assets: # like: last update, owner, revision count, etc. 
# omit this or set to false if not needed export_meta: false - # optional whether or not to check ssl certificates when requesting content from Bookstack host - verify_ssl: true +# optional - can override default http_config +# if not required, you can omit/comment out section +# https://urllib3.readthedocs.io/en/stable/reference/urllib3.util.html +# default backoff_factor 2.5 means we wait 5, 10, 20, and then 40 seconds before our last retry +# - this should allow for per minute rate limits to be refreshed +http_config: + # whether or not to verify ssl certificates if using https + verify_ssl: false + # set http timeout in seconds for requests + timeout: 30 + # default backoff_factor 2.5 means we wait 5, 10, 20, and then 40 seconds before our last retry + # - this should allow for per minute rate limits to be refreshed + backoff_factor: 2.5 + # which status codes trigger retries + retry_codes: [413, 429, 500, 502, 503, 504] + # number of retries + retry_count: 5 + # any additional headers to add to http requests + additional_headers: + test: "test" + test2: "test2" + User-Agent: "test-agent" # directory to export to # relative or full path output_path: "bkps/" @@ -49,3 +66,8 @@ output_path: "bkps/" # set to 1+ if you want to retain a certain number of archives # set to 0 or comment out section if you want no action done keep_last: 5 +## optional - if specified exporter will run in a loop +# it will run and then pause for {run_interval} seconds before running again +# specify in seconds, example: 86400 seconds = 24 hours or run once a day +# omit/commit out or set to 0 if you just want a single run and exit +run_interval: 0 \ No newline at end of file diff --git a/examples/docker-compose.yaml b/examples/docker-compose.yaml new file mode 100644 index 0000000..23fd82a --- /dev/null +++ b/examples/docker-compose.yaml @@ -0,0 +1,18 @@ +name: bookstack-file-exporter +services: + bookstack-file-exporter: + image: homeylab/bookstack-file-exporter:latest + # use a uid/gid that has permissions to write to local dump directory + user: 1000:1000 + container_name: bookstack-file-exporter + environment: + - LOG_LEVEL=info + # example volumes shown + # change the left side of the ':' to your preferred files/dir + volumes: + - /opt/bookstack/bkps/config.yml:/export/config/config.yml:ro + - /opt/bookstack/bkps/archives:/export/dump + # can also pass env variables as a file + env_file: + - bkp.env + restart: always \ No newline at end of file diff --git a/examples/minio_config.yml b/examples/minio_config.yml index 692cacd..517fa3d 100644 --- a/examples/minio_config.yml +++ b/examples/minio_config.yml @@ -3,20 +3,14 @@ # if you put http here, it will try verify=false, not to check certs host: "https://bookstack.mydomain.org" # You could optionally set the bookstack token_id and token_secret here instead of env -# If using env vars instead you can omit/comment out this section +# If using env vars instead you can leave values empty or omit this section credentials: - # set here or as env variable, BOOKSTACK_TOKEN_ID - # env var takes precedence over below - token_id: "" - # set here or as env variable, BOOKSTACK_TOKEN_SECRET - # env var takes precedence over below - token_secret: "" -# additional headers to add, examples below -# if not required, you can omit/comment out -additional_headers: - test: "test" - test2: "test2" - User-Agent: "test-agent" + # set here or as env variable, BOOKSTACK_TOKEN_ID + # env var takes precedence over below + token_id: "" + # set here or as env variable, 
BOOKSTACK_TOKEN_SECRET + # env var takes precedence over below + token_secret: "" # supported formats from bookstack below # specify one or more formats: @@ -58,6 +52,9 @@ assets: # optional export of all the images used in a page(s). # omit this or set to false if not needed export_images: false + # optional export of all the attachments used in a page(s). + # omit this or set to false if not needed + export_attachments: false # optional modify markdown files to replace image url links # with local exported image paths modify_markdown: false @@ -66,17 +63,35 @@ assets: # like: last update, owner, revision count, etc. # omit this or set to false if not needed export_meta: false - # optional whether or not to check ssl certificates when requesting content from Bookstack host - verify_ssl: true -# After uploading to object storage targets, choose to clean up local files -# delete the archive from local filesystem -# optional -# default = false if omitted -clean_up: true +# optional - can override default http_config +# if not required, you can omit/comment out section +# https://urllib3.readthedocs.io/en/stable/reference/urllib3.util.html +http_config: + # whether or not to verify ssl certificates if using https + verify_ssl: false + # set http timeout in seconds for requests + timeout: 30 + # default backoff_factor 2.5 means we wait 5, 10, 20, and then 40 seconds before our last retry + # - this should allow for per minute rate limits to be refreshed + backoff_factor: 2.5 + # which status codes trigger retries + retry_codes: [413, 429, 500, 502, 503, 504] + # number of retries + retry_count: 5 + # any additional headers to add to http requests + additional_headers: + test: "test" + test2: "test2" + User-Agent: "test-agent" # optional if specified exporter can delete older archives # valid values are: # set to -1 if you want to delete all archives after each run # - this is useful if you only want to upload to object storage # set to 1+ if you want to retain a certain number of archives # set to 0 or comment out section if you want no action done -keep_last: -1 \ No newline at end of file +keep_last: -1 +## optional - if specified exporter will run in a loop +# it will run and then pause for {run_interval} seconds before running again +# specify in seconds, example: 86400 seconds = 24 hours or run once a day +# omit/commit out or set to 0 if you just want a single run and exit +run_interval: 0 \ No newline at end of file diff --git a/setup.cfg b/setup.cfg index ef4786a..4c25490 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = bookstack-file-exporter -# version will be replaced by IMAGE_TAG in Makefile +# version will be replaced by IMAGE_TAG via Github Actions version = 0.0.1 author = pchang388 # author_email = your@email.address @@ -17,10 +17,10 @@ classifiers = [options] python_requires = >=3.8 install_requires = - Pyyaml >= 6.0.1 # https://pypi.org/project/PyYAML/ - Pydantic >= 2.4.0 # https://docs.pydantic.dev/latest/ - requests >= 2.31.0 # https://pypi.org/project/requests/ - minio >= 7.2.0 # https://pypi.org/project/minio/ + Pyyaml >= 6.0.2 # https://pypi.org/project/PyYAML/ + Pydantic >= 2.10.6 # https://docs.pydantic.dev/latest/ + requests >= 2.32.3 # https://pypi.org/project/requests/ + minio >= 7.2.15 # https://pypi.org/project/minio/ packages = find: [options.entry_points]
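One behavior worth calling out from the node.py change earlier in this diff: when a page has no slug, get_name falls back to the Django-derived slugify helper. A minimal sketch, assuming the package is installed under the import path used in this repository; the sample titles are made up.

    # sketch: filenames produced by the slugify fallback for slug-less pages
    from bookstack_file_exporter.exporter.node import Node

    print(Node.slugify("My Draft Page (WIP)"))                     # -> my-draft-page-wip
    print(Node.slugify("Révision Générale"))                       # -> revision-generale (ASCII-folded)
    print(Node.slugify("Révision Générale", allow_unicode=True))   # -> révision-générale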