diff --git a/.dockerignore b/.dockerignore index 21d444f..7c04879 100644 --- a/.dockerignore +++ b/.dockerignore @@ -162,6 +162,8 @@ cython_debug/ ## Local local/ +.vscode/ +.github/ ## test outputs bkps/ \ No newline at end of file diff --git a/.github/actions/docker/action.yml b/.github/actions/docker/action.yml new file mode 100644 index 0000000..3fa3194 --- /dev/null +++ b/.github/actions/docker/action.yml @@ -0,0 +1,84 @@ +--- +name: Docker image +description: Creates a Docker image + +# note inputs have some weird behavior: https://github.com/actions/runner/issues/1483 +# for string type, don't wrap in JSON +# for boolean type, wrap in JSON + +inputs: + dockerhub_username: + description: Dockerhub username + type: string + required: false + default: none + dockerhub_token: + description: Dockerhub token + type: string + required: false + default: none + push: + description: Push Images to docker hub + type: boolean + required: false + default: true + latest: + description: Update latest tag + type: boolean + required: false + default: true + +runs: + using: composite + steps: + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: docker.io/${{ github.repository }} + flavor: | + latest=${{ fromJSON(inputs.latest) }} + # for some reason can't get this to show up from docker image labels + # placing here for now + labels: | + org.opencontainers.image.description=Page asset and content exporter for Bookstack + tags: | + ## add the event types that should be added as tags + ## on merge to master - update `main` tag for testing before release + type=ref,event=branch + ## on release - for use by users + ## version ; shorthand for {{major}}.{{minor}}.{{patch}} (can include pre-release) + type=semver,pattern={{ version }} + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + + - name: Login to Dockerhub + if: github.event_name != 'pull_request' + 
uses: docker/login-action@v3 + with: + username: ${{ inputs.dockerhub_username }} + password: ${{ inputs.dockerhub_token }} + + - name: Build Docker Image + if: github.event_name != 'pull_request' + uses: docker/build-push-action@v5 + with: + context: . + file: ./Dockerfile + platforms: linux/amd64,linux/arm64 + push: ${{ fromJSON(inputs.push) }} + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + + - name: Update Dockerhub Documentation + uses: peter-evans/dockerhub-description@v3 + if: ${{ (fromJSON(inputs.latest) == true) && (github.event_name != 'pull_request') }} + with: + username: ${{ inputs.dockerhub_username }} + password: ${{ inputs.dockerhub_token }} + repository: ${{ github.repository }} \ No newline at end of file diff --git a/.github/actions/python/action.yml b/.github/actions/python/action.yml new file mode 100644 index 0000000..afef031 --- /dev/null +++ b/.github/actions/python/action.yml @@ -0,0 +1,41 @@ +--- +name: Deploy to PyPi +description: Deploys the python package to PyPi + +inputs: + pypi_api_token: + description: PyPi api token + type: string + required: true + +runs: + using: composite + steps: + - name: Get tag release without v + shell: bash + run: | + TAG=${{ github.ref_name }} + echo "VERSION=${TAG#v}" >> "$GITHUB_ENV" + echo "Tag without v is: ${VERSION}" + - name: Update Release Tag + shell: bash + run: sed -i "s/^version = [^ ]*/version = ${{ env.VERSION }}/" setup.cfg + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.12.1' + - name: Install Dependencies + shell: bash + run: | + python -m pip install --upgrade pip + pip install build + - name: Build Python Package + shell: bash + run: | + python -m pip install --upgrade build + python -m build + - name: Publish to PyPi + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ inputs.pypi_api_token }} + skip-existing: true \ No newline at end of file diff --git a/.github/actions/tests/action.yml 
b/.github/actions/tests/action.yml new file mode 100644 index 0000000..3606371 --- /dev/null +++ b/.github/actions/tests/action.yml @@ -0,0 +1,20 @@ +--- +name: Test Python Package +description: Test and lint code + +runs: + using: composite + steps: + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.12.1' + - name: Install dependencies + shell: bash + run: | + python -m pip install --upgrade pip + pip install pylint + - name: Analysing the code with pylint + shell: bash + run: | + pylint $(git ls-files '*.py') \ No newline at end of file diff --git a/.github/workflows/always.pylint.yml b/.github/bkp/always.pylint.yml similarity index 100% rename from .github/workflows/always.pylint.yml rename to .github/bkp/always.pylint.yml diff --git a/.github/bkp/on_pr_merged.yml b/.github/bkp/on_pr_merged.yml new file mode 100644 index 0000000..04552fb --- /dev/null +++ b/.github/bkp/on_pr_merged.yml @@ -0,0 +1,121 @@ +# needs: [tests] # require tests to pass before deploy runs + +name: Build and Push + +# on: +# push: +# # Pattern matched against refs/tags +# tags: +# - '**' # Push events to every tag including hierarchical tags like v1.0/beta + +on: + pull_request: + types: + - closed + branches: + - main + +# maybe trigger build/push on release tags? 
+# but this also works for my use case +jobs: + docker_deploy: + if: github.event.pull_request.merged + runs-on: ubuntu-latest + # specify this to target the correct env/secrets to use + environment: 'Dockerhub' + steps: + - uses: actions/checkout@v3 + - name: Login to Dockerhub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + - name: Build and push the Docker image + run: make docker_build_latest + # - name: Push Docker image + # run: make docker_push_latest + - name: Update Dockerhub Documentation + uses: peter-evans/dockerhub-description@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + repository: ${{ env.DOCKERHUB_REPO }} + pypi_deploy: + if: github.event.pull_request.merged + runs-on: ubuntu-latest + environment: 'PyPi' + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + - name: Set tag version + run: | + TAG=$(cat Makefile | grep -E ^IMAGE_TAG=[0-9].[0-9].[0-9] | cut -d "=" -f2) + echo "VERSION=${TAG}" >> "$GITHUB_ENV" + echo "version from Makefile is: ${VERSION}" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Update Release tag + run: sed -i "s/^version = [^ ]*/version = ${{ env.VERSION }}/" setup.cfg + - name: Build package + run: make build + - name: Publish package + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.PYPI_API_TOKEN }} + skip-existing: true + create_tag: + if: github.event.pull_request.merged + runs-on: ubuntu-latest + needs: + - docker_deploy + - pypi_deploy + permissions: + contents: write + steps: + - uses: actions/checkout@v3 + with: + ref: ${{ github.event.pull_request.merge_commit_sha }} + fetch-depth: 
'0' + - name: Set tag version + run: | + TAG=$(cat Makefile | grep -E ^IMAGE_TAG=[0-9].[0-9].[0-9] | cut -d "=" -f2) + echo "VERSION=v${TAG}" >> "$GITHUB_ENV" + echo "version from Makefile is: ${VERSION}" + - name: Create tag + uses: anothrNick/github-tag-action@1.64.0 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # WITH_V: true + # PRERELEASE: true + CUSTOM_TAG: ${{ env.VERSION }} + create_release: + if: github.event.pull_request.merged + runs-on: ubuntu-latest + needs: + - create_tag + permissions: + contents: write + steps: + - uses: actions/checkout@v3 + - name: Set tag version + run: | + TAG=$(cat Makefile | grep -E ^IMAGE_TAG=[0-9].[0-9].[0-9] | cut -d "=" -f2) + echo "VERSION=v${TAG}" >> "$GITHUB_ENV" + echo "version from Makefile is: ${VERSION}" + - name: Generate release + uses: ncipollo/release-action@v1 + with: + tag: ${{ env.VERSION }} + generateReleaseNotes: true + skipIfReleaseExists: true + # docker image tag latest diff --git a/.github/workflows/on_pr_open.docker-build.yml b/.github/bkp/on_pr_open.docker-build.yml similarity index 84% rename from .github/workflows/on_pr_open.docker-build.yml rename to .github/bkp/on_pr_open.docker-build.yml index 0f91b2e..73a9c1b 100644 --- a/.github/workflows/on_pr_open.docker-build.yml +++ b/.github/bkp/on_pr_open.docker-build.yml @@ -11,6 +11,11 @@ jobs: - uses: actions/checkout@v3 - name: Prepare Makefile run: sed -i 's/^IMAGE_TAG=[^ ]*/IMAGE_TAG=${{github.run_id}}/' Makefile + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 - name: Build the Docker image run: make docker_build pip_build: diff --git a/.github/workflows/on_pr_merged.yml b/.github/workflows/on_pr_merged.yml index 255b7ed..d270180 100644 --- a/.github/workflows/on_pr_merged.yml +++ b/.github/workflows/on_pr_merged.yml @@ -1,116 +1,32 @@ # needs: [tests] # require tests to pass before deploy runs -name: Build and Push - -# on: -# push: -# # Pattern 
matched against refs/tags -# tags: -# - '**' # Push events to every tag including hierarchical tags like v1.0/beta +name: Create Current Main Image on: - pull_request: - types: - - closed + push: branches: - main -# maybe trigger build/push on release tags? -# but this also works for my use case jobs: - docker_deploy: - if: github.event.pull_request.merged - runs-on: ubuntu-latest - # specify this to target the correct env/secrets to use - environment: 'Dockerhub' - steps: - - uses: actions/checkout@v3 - - name: Login to Dockerhub - uses: docker/login-action@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - - name: Build the Docker image - run: make docker_build_latest - - name: Push Docker image - run: make docker_push_latest - - name: Update Dockerhub Documentation - uses: peter-evans/dockerhub-description@v3 - with: - username: ${{ secrets.DOCKERHUB_USERNAME }} - password: ${{ secrets.DOCKERHUB_TOKEN }} - repository: ${{ env.DOCKERHUB_REPO }} - pypi_deploy: - if: github.event.pull_request.merged + test: runs-on: ubuntu-latest - environment: 'PyPi' steps: - - uses: actions/checkout@v3 - - name: Set up Python - uses: actions/setup-python@v3 - with: - python-version: '3.x' - - name: Set tag version - run: | - TAG=$(cat Makefile | grep -E ^IMAGE_TAG=[0-9].[0-9].[0-9] | cut -d "=" -f2) - echo "VERSION=${TAG}" >> "$GITHUB_ENV" - echo "version from Makefile is: ${VERSION}" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install build - - name: Update Release tag - run: sed -i "s/^version = [^ ]*/version = ${{ env.VERSION }}/" setup.cfg - - name: Build package - run: make build - - name: Publish package - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.PYPI_API_TOKEN }} - skip-existing: true - create_tag: - if: github.event.pull_request.merged - runs-on: ubuntu-latest - needs: - - docker_deploy - - pypi_deploy - permissions: - contents: write - steps: - - 
uses: actions/checkout@v3 - with: - ref: ${{ github.event.pull_request.merge_commit_sha }} - fetch-depth: '0' - - name: Set tag version - run: | - TAG=$(cat Makefile | grep -E ^IMAGE_TAG=[0-9].[0-9].[0-9] | cut -d "=" -f2) - echo "VERSION=v${TAG}" >> "$GITHUB_ENV" - echo "version from Makefile is: ${VERSION}" - - name: Create tag - uses: anothrNick/github-tag-action@1.64.0 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # WITH_V: true - # PRERELEASE: true - CUSTOM_TAG: ${{ env.VERSION }} - create_release: - if: github.event.pull_request.merged + - name: Checkout + uses: actions/checkout@v4 + - name: Python Tests + uses: ./.github/actions/tests + # push to `main` image for testing/most up to date + docker-build: runs-on: ubuntu-latest - needs: - - create_tag - permissions: - contents: write + needs: test + environment: 'Dockerhub' + timeout-minutes: 10 steps: - - uses: actions/checkout@v3 - - name: Set tag version - run: | - TAG=$(cat Makefile | grep -E ^IMAGE_TAG=[0-9].[0-9].[0-9] | cut -d "=" -f2) - echo "VERSION=v${TAG}" >> "$GITHUB_ENV" - echo "version from Makefile is: ${VERSION}" - - name: Generate release - uses: ncipollo/release-action@v1 + - name: Checkout + uses: actions/checkout@v4 + - name: Run Docker Build + uses: ./.github/actions/docker with: - tag: ${{ env.VERSION }} - generateReleaseNotes: true - skipIfReleaseExists: true - # docker image tag latest + latest: false + dockerhub_username: ${{ secrets.DOCKERHUB_USERNAME }} + dockerhub_token: ${{ secrets.DOCKERHUB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/on_pr_open.yml b/.github/workflows/on_pr_open.yml new file mode 100644 index 0000000..6a776c3 --- /dev/null +++ b/.github/workflows/on_pr_open.yml @@ -0,0 +1,18 @@ +name: Test + +on: + pull_request: + branches: [ "main" ] + types: + - opened + - reopened + - synchronize + +jobs: + test: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Python Tests + uses: ./.github/actions/tests \ 
No newline at end of file diff --git a/.github/workflows/on_release.yml b/.github/workflows/on_release.yml new file mode 100644 index 0000000..aa9aa72 --- /dev/null +++ b/.github/workflows/on_release.yml @@ -0,0 +1,62 @@ +--- +name: Create Official Release and Push Artifacts + +on: + push: + tags: + - v* + +permissions: + contents: write + +jobs: + tests: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Tests + uses: ./.github/actions/tests + create_release: + runs-on: ubuntu-latest + needs: tests + permissions: + contents: write + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Create Release + uses: ncipollo/release-action@v1 + with: + tag: ${{ github.ref_name }} + generateReleaseNotes: true + # build and push docker image + release-docker: + runs-on: ubuntu-latest + needs: + - tests + - create_release + environment: 'Dockerhub' + timeout-minutes: 10 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Run Docker Build + uses: ./.github/actions/docker + with: + dockerhub_username: ${{ secrets.DOCKERHUB_USERNAME }} + dockerhub_token: ${{ secrets.DOCKERHUB_TOKEN }} + release-python: + runs-on: ubuntu-latest + needs: + - tests + - create_release + timeout-minutes: 20 + environment: 'PyPi' + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Deploy release to PyPi + uses: ./.github/actions/python + with: + pypi_api_token: "${{ secrets.PYPI_API_TOKEN }}" \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 707f8a5..c368f25 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,13 @@ ARG BASE_IMAGE=python -ARG BASE_IMAGE_TAG=3.12-slim-python +ARG BASE_IMAGE_TAG=3.12.1-slim-bookworm FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} +LABEL \ + org.opencontainers.image.title="bookstack-file-exporter" \ + org.opencontainers.image.description="Page asset and content exporter for Bookstack" \ + org.opencontainers.image.source="https://github.com/homeylab/bookstack-file-exporter" + # Get 
security updates and clean up apt cache for smaller size RUN apt update -y && apt upgrade -y && \ apt install dumb-init && \ @@ -11,9 +16,9 @@ RUN apt update -y && apt upgrade -y && \ # create docker user RUN useradd -M -s /usr/sbin/nologin -u 33333 exporter -ARG DOCKER_WORK_DIR -ARG DOCKER_CONFIG_DIR -ARG DOCKER_EXPORT_DIR +ARG DOCKER_WORK_DIR=/export +ARG DOCKER_CONFIG_DIR=/export/config +ARG DOCKER_EXPORT_DIR=/export/dump ENV DOCKER_CONFIG_DIR=${DOCKER_CONFIG_DIR} ENV DOCKER_EXPORT_DIR=${DOCKER_EXPORT_DIR} diff --git a/Makefile b/Makefile index 08edc58..4eebcfd 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ BASE_IMAGE_TAG=3.12-slim-bookworm IMAGE_NAME=homeylab/bookstack-file-exporter # keep this start sequence unique (IMAGE_TAG=) # github actions will use this to create a tag -IMAGE_TAG=1.0.2 +IMAGE_TAG=main DOCKER_WORK_DIR=/export DOCKER_CONFIG_DIR=/export/config DOCKER_EXPORT_DIR=/export/dump @@ -29,6 +29,8 @@ download_testpypi: docker_build: docker buildx build \ + --platform linux/amd64,linux/arm64 \ + --output "type=image,push=false" \ --build-arg BASE_IMAGE=${BASE_IMAGE} \ --build-arg BASE_IMAGE_TAG=${BASE_IMAGE_TAG} \ --build-arg DOCKER_WORK_DIR=${DOCKER_WORK_DIR} \ @@ -39,6 +41,8 @@ docker_build: docker_build_latest: docker buildx build \ + --platform linux/amd64,linux/arm64 \ + --output "type=image,push=true" \ --build-arg BASE_IMAGE=${BASE_IMAGE} \ --build-arg BASE_IMAGE_TAG=${BASE_IMAGE_TAG} \ --build-arg DOCKER_WORK_DIR=${DOCKER_WORK_DIR} \ diff --git a/README.md b/README.md index ed5d612..e118dcb 100644 --- a/README.md +++ b/README.md @@ -6,19 +6,20 @@ Table of Contents - [Use Case](#use-case) - [Using This Application](#using-this-application) - [Run via Pip](#run-via-pip) - - [Run Via Docker](#run-via-docker) + - [Run via Docker](#run-via-docker) - [Authentication](#authentication) - [Configuration](#configuration) - [Backup Behavior](#backup-behavior) - [General](#general) - [Images](#images) + - [Attachments](#attachments) - [Modify 
Markdown Files](#modify-markdown-files) - [Object Storage](#object-storage) - [Minio Backups](#minio-backups) - [Future Items](#future-items) ## Background -_Features are actively being developed. See `Future Items` section for more details. Open an issue for a feature request._ +_If you encounter any issues, want to request an additional feature, or would like to provide assistance, feel free to open a GitHub issue._ This tool provides a way to export [Bookstack](https://github.com/BookStackApp/BookStack) pages and their content (_text, images, metadata, etc._) into a relational parent-child layout locally with an option to push to remote object storage locations. See [Backup Behavior](#backup-behavior) section for more details on how pages are organized. @@ -29,14 +30,13 @@ What it does: - Discover and build relationships between Bookstack `Shelves/Books/Chapters/Pages` to create a relational parent-child layout - Export Bookstack pages and their content to a `.tgz` archive -- Additional content for pages like their images and metadata and can be exported -- The exporter can also [Modify Markdown Files](#modify-markdown-files) to replace image links with local exported image paths for a more portable backup +- Additional content for pages, like their images, attachments, and metadata, can also be exported +- The exporter can also [Modify Markdown Files](#modify-markdown-files) to replace image and/or attachment links with local exported paths for a more portable backup - YAML configuration file for repeatable and easy runs - Can be run via [Python](#run-via-pip) or [Docker](#run-via-docker) - Can push archives to remote object storage like [Minio](https://min.io/) - Basic housekeeping option (`keep_last`) to keep a tidy archive destination - Supported backup targets are: 1. local @@ -57,7 +57,7 @@ The main use case is to backup all docs in a relational directory-tree format to 2. Offline copy wanted. 3. 
Back up at a file level as an accessory or alternative to disk and volume backups. 4. Migrate all Bookstack page contents to Markdown documenting for simplicity. -5. Provide an easy way to do automated file backups locally, in docker, or kubernetes for Bookstack page contents. +5. Provide an easy way to do automated file backups locally, in docker, or [kubernetes](https://github.com/homeylab/helm-charts/tree/main/charts/bookstack#file-exporter-backup-your-pages) for Bookstack page contents. ## Using This Application Ensure a valid configuration is provided when running this application. See [Configuration](#Configuration) section for more details. @@ -77,6 +77,7 @@ formats: # md only example output_path: "bkps/" assets: export_images: false + export_attachments: false modify_markdown: false export_meta: false verify_ssl: true @@ -85,10 +86,16 @@ assets: ### Run via Pip The exporter can be installed via pip and run directly. +#### Python Version +_Note: This application is tested and developed on Python version `3.12.X`. The min required version is >= `3.8` but is recommended to install (or set up a venv) a `3.12.X` version._ + #### Examples ```bash python -m pip install bookstack-file-exporter +# if you prefer a specific version, example: +python -m pip install bookstack-file-exporter==X.X.X + # using pip python -m bookstack_file_exporter -c @@ -114,11 +121,17 @@ export LOG_LEVEL=debug python -m bookstack_file_exporter -c ``` -#### Python Version -_Note: This application is tested and developed on Python version `3.12.X`. The min required version is >= `3.8` but is recommended to install (or set up a venv) a `3.12.X` version._ +### Run via Docker +Docker images are provided for `linux/amd64` and `linux/arm64` variants only at the moment. If another variant is required, please request it via Github Issue. + +#### Tags +Users will generally want to use the `latest` tag or a specific version tag. The `main` tag is also provided but is not guaranteed to be stable. 
-### Run Via Docker -Docker can be utilized to run the exporter. +| tag | description | +| --- | ----------- | +| `latest` | Latest stable release and is updated with each new stable release. | +| `X.X.X` | Semantic versioned releases are also provided if preferred for stability or other reasons. | +| `main` | This tag reflects the `main` branch of this repository and may not be stable | #### Examples ```bash @@ -137,7 +150,6 @@ docker run \ homeylab/bookstack-file-exporter:latest ``` - #### Environment Variables See [Valid Environment Variables](#valid-environment-variables) for more options. @@ -211,6 +223,7 @@ minio: output_path: "bkps/" assets: export_images: true + export_attachments: true modify_markdown: false export_meta: false verify_ssl: true @@ -231,6 +244,7 @@ More descriptions can be found for each section below: | `output_path` | `str` | `false` | Optional (default: `cwd`) which directory (relative or full path) to place exports. User who runs the command should have access to read/write to this directory. If not provided, will use current run directory by default | | `assets` | `object` | `false` | Optional section to export additional assets from pages. | | `assets.export_images` | `bool` | `false` | Optional (default: `false`), export all images for a page to an `image` directory within page directory. See [Backup Behavior](#backup-behavior) for more information on layout | +| `assets.export_attachments` | `bool` | `false` | Optional (default: `false`), export all attachments for a page to an `attachments` directory within page directory. See [Backup Behavior](#backup-behavior) for more information on layout | | `assets.modify_markdown` | `bool` | `false` | Optional (default: `false`), modify markdown files to replace image links with local exported image paths. This requires `assets.export_images` to be `true` in order to work. See [Modify Markdown Files](#modify-markdown-files) for more information. 
| `assets.export_meta` | `bool` | `false` | Optional (default: `false`), export of metadata about the page in a json file | | `assets.verify_ssl` | `bool` | `false` | Optional (default: `true`), whether or not to check ssl certificates when requesting content from Bookstack host | @@ -239,7 +253,7 @@ More descriptions can be found for each section below: #### Valid Environment Variables General -- `LOG_LEVEL`: default: `info``. Provide a valid log level: info, debug, warning, error. +- `LOG_LEVEL`: default: `info`. Provide a valid log level: info, debug, warning, error. [Bookstack Credentials](#authentication) - `BOOKSTACK_TOKEN_ID` @@ -256,8 +270,12 @@ Backups are exported in `.tgz` format and generated based off timestamp. Export The exporter can also do housekeeping duties and keep a configured number of archives and delete older ones. See `keep_last` property in the [Configuration](#options-and-descriptions) section. Object storage provider configurations include their own `keep_last` property for flexibility. -For file names, `slug` names (from Bookstack API) are used, as such certain characters like `!`, `/` will be ignored and spaces replaced from page names/titles. +#### File Naming +For file names, `slug` names (from Bookstack API) are used; as such, certain characters like `!`, `/` will be ignored and spaces replaced from page names/titles. If your page has an empty `slug` value for some reason (draft that was never fully saved), the exporter will use the page name with the `slugify` function from Django to generate a valid slug. Example: `My Page.bin Name!` will be converted to `my-page-bin-name`. +You may also notice some directories (books) and/or files (pages) in the archive have a random string at the end, example - `nKA`: `user-and-group-management-nKA`. This is expected and is because there were resources with the same name created in another shelf and Bookstack adds a string at the end to ensure uniqueness. 
+ +#### Directory Layout All sub directories will be created as required during the export process. ``` Shelves --> Books --> Chapters --> Pages @@ -289,7 +307,7 @@ kafka-apps (shelf) ---> settings.md (page) ... -## Example with image layout +## Example with image and attachment layout # unassigned dir is used for books with no shelf unassigned (shelf) ---> test (book) @@ -300,12 +318,20 @@ unassigned (shelf) ---> rec-page ---> img-010.png ---> img-020.png + --> attachments (attachment_dir) + ---> test_page (page directory) + ---> something.config + ---> something_else.config + ---> rec-page + ---> test_output.log + ---> actual_output.log ---> test_page.md (page) ... ---> rec_page (page) ---> rec_page.md ---> rec_page.pdf ``` + Another example is shown below: ``` ## First example: @@ -320,13 +346,18 @@ bookstack_export_2023-11-28_06-24-25/programming/react/images/basics/dwwimage.pn bookstack_export_2023-11-28_06-24-25/programming/react/images/basics/NzZimage.png bookstack_export_2023-11-28_06-24-25/programming/react/images/nextjs/next1.png bookstack_export_2023-11-28_06-24-25/programming/react/images/nextjs/tips.png +bookstack_export_2023-11-28_06-24-25/programming/react/attachments/nextjs/sample.config +bookstack_export_2023-11-28_06-24-25/programming/react/attachments/nextjs/sample_output.log bookstack_export_2023-11-28_06-24-25/programming/react/nextjs.md bookstack_export_2023-11-28_06-24-25/programming/react/nextjs.pdf ``` Books without a shelf will be put in a shelve folder named `unassigned`. -Empty/New Pages will be ignored since they have not been modified yet from creation and are empty but also do not have a valid slug. Example: +#### Empty/New Pages +Empty/New Pages will be ignored since they have not been modified yet from creation and are empty but also do not have a valid slug. + +Example from Bookstack API: ``` { ... 
@@ -336,10 +367,7 @@ Empty/New Pages will be ignored since they have not been modified yet from creat } ``` -You may notice some directories (books) and/or files (pages) in the archive have a random string at the end, example - `nKA`: `user-and-group-management-nKA`. This is expected and is because there were resources with the same name created in another shelve and bookstack adds a string at the end to ensure uniqueness. - ### Images - Images will be dumped in a separate directory, `images` within the page parent (book/chapter) directory it belongs to. The relative path will be `{parent}/images/{page}/{image_name}`. As shown earlier: ``` @@ -351,22 +379,37 @@ bookstack_export_2023-11-28_06-24-25/programming/react/images/nextjs/tips.png **Note you may see old images in your exports. This is because, by default, Bookstack retains images/drawings that are uploaded even if no longer referenced on an active page. Admins can run `Cleanup Images` in the Maintenance Settings or via [CLI](https://www.bookstackapp.com/docs/admin/commands/#cleanup-unused-images) to remove them.** +### Attachments +Attachments will be dumped in a separate directory, `attachments`, within the parent (book/chapter) directory of the page they belong to. The relative path will be `{parent}/attachments/{page}/{attachment_name}`. As shown earlier: + +``` +bookstack_export_2023-11-28_06-24-25/programming/react/attachments/nextjs/sample.config +bookstack_export_2023-11-28_06-24-25/programming/react/attachments/nextjs/sample_output.log +... +... +``` + +**Note: attachments that are just external links are ignored. Only attachments that are shown as `external: False` will be exported.** + +[Reference](https://demo.bookstackapp.com/api/docs#attachments-list) and excerpt from Bookstack API docs: +> Get a listing of attachments visible to the user. The external property indicates whether the attachment is simple a link. A false value for the external property would indicate a file upload. 
+ ### Modify Markdown Files -**To use this feature, `assets.export_images` should be set to `true`** +**To use this feature, `assets.export_images` and/or `assets.export_attachments` should be set to `true`** -The configuration item, `assets.modify_markdown`, can be set to `true` to modify markdown files to replace image url links with local exported image paths. This feature allows for you to make your `markdown` exports much more portable. +The configuration item, `assets.modify_markdown`, can be set to `true` to modify markdown files to replace image and attachment url links with local exported image/attachment paths. This feature allows you to make your `markdown` exports much more portable. -Page (parent) -> Images (Children) relationships are created and then each image url is replaced with its own respective local export path. Example: +Page (parent) -> Images/Attachments (Children) relationships are created and then each image/attachment url is replaced with its own respective local export path. Example: ``` ## before [![pool-topology-1.png](https://demo.bookstack/uploads/images/gallery/2023-07/scaled-1680-/pool-topology-1.png)](https://demo.bookstack/uploads/images/gallery/2023-07/pool-topology-1.png) ## after -[![pool-topology-1.png](./images/{page_name}/pool-topology-1.png)](https://demo.bookstack/uploads/images/gallery/2023-07/pool-topology-1.png) +[![pool-topology-1.png](images/{page_name}/pool-topology-1.png)](https://demo.bookstack/uploads/images/gallery/2023-07/pool-topology-1.png) ``` -This allows the image to be found locally within the export files and allow your `markdown` docs to have all the images display properly like it would normally would. +This allows the image or attachment to be found locally within the export files and allows your `markdown` docs to display all the assets properly, as they normally would. 
-**Note: This will work properly if your pages are using the notation used by Bookstack for Markdown image links, example: ` [![image alt text](Bookstack Markdown image URL link)](anchor/url link)` The `(anchor/url link)` is optional.** +**Note: This will work properly if your pages are using the notation used by Bookstack for Markdown image links, example: ` [![image alt text](Bookstack Markdown image URL link)](anchor/url link)`. The `(anchor/url link)` is optional. For attachments, the format is: `[file](url link)`.** ## Object Storage Optionally, target(s) can be specified to upload generated archives to a remote location. Supported object storage providers can be found below: @@ -408,7 +451,7 @@ minio: 1. ~~Be able to pull images locally and place in their respective page folders for a more complete file level backup.~~ 2. ~~Include the exporter in a maintained helm chart as an optional deployment. The helm chart is [here](https://github.com/homeylab/helm-charts/tree/main/charts/bookstack).~~ 3. ~~Be able to modify markdown links of images to local exported images in their respective page folders for a more complete file level backup.~~ -4. Be able to pull attachments locally and place in their respective page folders for a more complete file level backup. +4. ~~Be able to pull attachments locally and place in their respective page folders for a more complete file level backup.~~ 5. Export S3 and more options. 6. Filter shelves and books by name - for more targeted backups. Example: you only want to share a book about one topic with an external friend/user. 7. 
Be able to pull media/photos from 3rd party providers like `drawio` \ No newline at end of file diff --git a/bookstack_file_exporter/archiver/archiver.py b/bookstack_file_exporter/archiver/archiver.py index c6d01c5..6af2024 100644 --- a/bookstack_file_exporter/archiver/archiver.py +++ b/bookstack_file_exporter/archiver/archiver.py @@ -5,7 +5,7 @@ from bookstack_file_exporter.exporter.node import Node from bookstack_file_exporter.archiver import util -from bookstack_file_exporter.archiver.page_archiver import PageArchiver, ImageNode +from bookstack_file_exporter.archiver.page_archiver import PageArchiver from bookstack_file_exporter.archiver.minio_archiver import MinioArchiver from bookstack_file_exporter.config_helper.remote import StorageProviderConfig from bookstack_file_exporter.config_helper.config_helper import ConfigNode @@ -17,7 +17,7 @@ # pylint: disable=too-many-instance-attributes class Archiver: """ - Archiver pulls all the necessary files from upstream + Archiver helps handle archive duties: pulls all the necessary files from upstream and then pushes them to the specified backup location(s) Args: @@ -32,41 +32,15 @@ def __init__(self, config: ConfigNode): # for convenience self.base_dir = config.base_dir_name self.archive_dir = self._generate_root_folder(self.base_dir) - self._page_archiver = self._generate_page_archiver() + self._page_archiver = PageArchiver(self.archive_dir, self.config) self._remote_exports = {'minio': self._archive_minio, 's3': self._archive_s3} - def get_bookstack_exports(self, page_nodes: Dict[int, Node]): """export all page content""" log.info("Exporting all bookstack page contents") # get images first if requested # this is because we may want to manipulate page data with modify_markdown flag - all_image_meta = self._get_page_image_map() - for _, page in page_nodes.items(): - page_image_meta = [] - if page.id_ in all_image_meta: - page_image_meta = all_image_meta[page.id_] - self._get_page_files(page, page_image_meta) - 
self._get_page_images(page, page_image_meta) - - def _get_page_files(self, page_node: Node, image_meta: List[ImageNode]): - """pull all bookstack pages into local files/tar""" - log.debug("Exporting bookstack page data") - self._page_archiver.archive_page(page_node, image_meta) - - def _get_page_image_map(self) -> Dict[int, ImageNode]: - if not self._page_archiver.export_images: - log.debug("skipping image export based on user input") - return {} - return self._page_archiver.get_image_meta() - - def _get_page_images(self, page_node: Node, img_nodes: List[ImageNode]): - if not img_nodes: - log.debug("page has no images to pull") - return - log.debug("Exporting bookstack page images") - self._page_archiver.archive_page_images(page_node.parent.file_path, - page_node.name, img_nodes) + self._page_archiver.archive_pages(page_nodes) def create_archive(self): """create tgz archive""" @@ -145,10 +119,6 @@ def _delete_files(self, file_list: List[str]): for file in file_list: util.remove_file(file) - def _generate_page_archiver(self)-> PageArchiver: - return PageArchiver(self.archive_dir, self.config) - - @staticmethod def _generate_root_folder(base_folder_name: str) -> str: """return base archive name""" diff --git a/bookstack_file_exporter/archiver/asset_archiver.py b/bookstack_file_exporter/archiver/asset_archiver.py new file mode 100644 index 0000000..71f2583 --- /dev/null +++ b/bookstack_file_exporter/archiver/asset_archiver.py @@ -0,0 +1,205 @@ +import logging +import base64 +from typing import Union, List, Dict +from re import sub as re_sub +# pylint: disable=import-error +from requests import Response + +from bookstack_file_exporter.common import util as common_util + +log = logging.getLogger(__name__) + +_IMAGE_DIR_NAME = "images" +_ATTACHMENT_DIR_NAME = "attachments" + + +class AssetNode: + """ + Base class for other asset nodes. This class should not be used directly. 
+ + Args: + :meta_data: = asset meta data + + Returns: + AssetNode instance for use in other classes + """ + def __init__(self, meta_data: Dict[str, int | str | bool]): + self.id: int = meta_data['id'] + self.page_id: int = meta_data['uploaded_to'] + self.url: str = "" + self.name: str = "" + self._markdown_str = "" + self._relative_path_prefix: str = "" + + def get_relative_path(self, page_name: str) -> str: + """image path local to page directory""" + return f"{self._relative_path_prefix}/{page_name}/{self.name}" + + @property + def markdown_str(self): + """return markdown url str to replace""" + return self._markdown_str + + def set_markdown_content(self, asset_data: Dict[str, int | str | bool]) -> None: + """set markdown url str to replace""" + self._markdown_str = self._get_md_url_str(asset_data) + + @staticmethod + def _get_md_url_str(asset_data: Dict[str, Union[int, str]]) -> str: + url_str = "" + if 'content' in asset_data: + if 'markdown' in asset_data['content']: + url_str = asset_data['content']['markdown'] + # check to see if empty before doing find + if not url_str: + return "" + # find the link between two parenthesis + # - markdown format + return url_str[url_str.find("(")+1:url_str.find(")")] + +class ImageNode(AssetNode): + """ + ImageNode handles image meta data and markdown url replacement. + + Args: + :meta_data: = image meta data + + Returns: + ImageNode instance for use in archiving images for a page + """ + def __init__(self, meta_data: Dict[str, Union[int, str]]): + super().__init__(meta_data) + self.url: str = meta_data['url'] + self.name: str = self.url.split('/')[-1] + log.debug("Image node has generated url: %s", self.url) + self._relative_path_prefix = f"{_IMAGE_DIR_NAME}" + +class AttachmentNode(AssetNode): + """ + AttachmentNode handles attachment meta data and markdown url replacement. 
+ + Args: + :meta_data: = attachment meta data + :base_url: = base url for attachment download + + Returns: + AttachmentNode instance for use in archiving attachments for a page + """ + def __init__(self, meta_data: Dict[str, Union[int, str, bool]], + base_url: str): + super().__init__(meta_data) + self.url: str = f"{base_url}/{self.id}" + self.name = meta_data['name'] + log.debug("Attachment node has generated url: %s", self.url) + self._relative_path_prefix = f"{_ATTACHMENT_DIR_NAME}" + + @staticmethod + def _get_md_url_str(asset_data: Dict[str, int | str | dict]) -> str: + url_str = "" + if 'links' in asset_data: + if 'markdown' in asset_data['links']: + url_str = asset_data['links']['markdown'] + # check to see if empty before doing find + if not url_str: + return "" + # find the link between two parenthesis + # - markdown format + return url_str[url_str.find("(")+1:url_str.find(")")] + +class AssetArchiver: + """ + AssetArchiver handles image and attachment exports for a page. + + Args: + :urls: = api urls for images and attachments + :headers: = http headers for api requests + :verify_ssl: = verify ssl for api requests + + Returns: + AssetArchiver instance for use in archiving images and attachments for a page + """ + def __init__(self, urls: Dict[str, str], headers: Dict[str, str], + verify_ssl: bool): + self.api_urls = urls + self.verify_ssl = verify_ssl + self._headers = headers + self._asset_map = { + 'images': self._create_image_map, + 'attachments': self._create_attachment_map + } + + def get_asset_nodes(self, asset_type: str) -> Dict[str, ImageNode | AttachmentNode]: + """Get image or attachment helpers for a page""" + asset_response: Response = common_util.http_get_request( + self.api_urls[asset_type], + self._headers, + self.verify_ssl) + asset_json = asset_response.json()['data'] + return self._asset_map[asset_type](asset_json) + + def get_asset_data(self, asset_type: str, + meta_data: Union[AttachmentNode, ImageNode]) -> Dict[str, str | bool | int 
| dict]: + """Get asset data based on type""" + data_url = f"{self.api_urls[asset_type]}/{meta_data.id}" + asset_data_response: Response = common_util.http_get_request( + data_url, + self._headers, + self.verify_ssl) + return asset_data_response.json() + + def get_asset_bytes(self, asset_type: str, url: str) -> bytes: + """Get raw asset data""" + asset_response: Response = common_util.http_get_request( + url, + self._headers, + self.verify_ssl) + match asset_type: + case "images": + asset_data = asset_response.content + case "attachments": + asset_data = self._decode_attachment_data(asset_response.json()['content']) + return asset_data + + def update_asset_links(self, asset_type, page_name: str, page_data: bytes, + asset_nodes: List[ImageNode | AttachmentNode]) -> bytes: + """update markdown links in page data""" + for asset_node in asset_nodes: + asset_data = self.get_asset_data(asset_type, asset_node) + asset_node.set_markdown_content(asset_data) + if not asset_node.markdown_str: + continue + page_data = re_sub(asset_node.markdown_str.encode(), + asset_node.get_relative_path(page_name).encode(), page_data) + return page_data + + @staticmethod + def _create_image_map(json_data: Dict[str, + List[Dict[str, str | int | bool | dict]]]) -> Dict[int, List[ImageNode]]: + image_page_map = {} + for img_meta in json_data: + img_node = ImageNode(img_meta) + if img_node.page_id in image_page_map: + image_page_map[img_node.page_id].append(img_node) + else: + image_page_map[img_node.page_id] = [img_node] + return image_page_map + + def _create_attachment_map(self, + json_data: Dict[str, List[Dict[str, str | int | bool | dict]]]) -> List[AssetNode]: + asset_nodes = {} + for asset_meta in json_data: + asset_node = None + if asset_meta['external']: + continue # skip external link, only get attachments + asset_node = AttachmentNode(asset_meta, self.api_urls['attachments']) + if asset_node.page_id in asset_nodes: + asset_nodes[asset_node.page_id].append(asset_node) + else: + 
asset_nodes[asset_node.page_id] = [asset_node] + return asset_nodes + + @staticmethod + def _decode_attachment_data(b64encoded_data: str) -> bytes: + """decode base64 encoded data""" + asset_data = b64encoded_data.encode() + return base64.b64decode(asset_data) diff --git a/bookstack_file_exporter/archiver/page_archiver.py b/bookstack_file_exporter/archiver/page_archiver.py index 812da99..4398e27 100644 --- a/bookstack_file_exporter/archiver/page_archiver.py +++ b/bookstack_file_exporter/archiver/page_archiver.py @@ -1,12 +1,9 @@ from typing import Union, List, Dict -import re -# pylint: disable=import-error -from requests import Response from bookstack_file_exporter.exporter.node import Node from bookstack_file_exporter.archiver import util as archiver_util +from bookstack_file_exporter.archiver.asset_archiver import AssetArchiver, ImageNode, AttachmentNode from bookstack_file_exporter.config_helper.config_helper import ConfigNode -from bookstack_file_exporter.common import util as common_util _META_FILE_SUFFIX = "_meta.json" _TAR_SUFFIX = ".tar" @@ -24,61 +21,11 @@ "tgz": _TAR_GZ_SUFFIX } -_IMAGE_DIR_NAME = "images" -_MARKDOWN_STR_CHECK = "markdown" - -class ImageNode: - """ - ImageNode provides metadata and convenience for Bookstack images. - - Args: - :img_meta_data: = image meta data - - Returns: - :ImageNode: instance with attributes to help handle images. 
- """ - def __init__(self, img_meta_data: Dict[str, Union[int, str]]): - self.id: int = img_meta_data['id'] - self.page_id: int = img_meta_data['uploaded_to'] - self.url: str = img_meta_data['url'] - self.name: str = self._get_image_name() - self._markdown_str = "" - self._relative_path_prefix: str = f"./{_IMAGE_DIR_NAME}" - - def _get_image_name(self) -> str: - return self.url.split('/')[-1] - - def get_image_relative_path(self, page_name: str) -> str: - """return image path local to page directory""" - return f"{self._relative_path_prefix}/{page_name}/{self.name}" - - def set_markdown_content(self, img_details: Dict[str, Union[int, str]]): - """provide image metadata to set markdown properties""" - self._markdown_str = self._get_md_url_str(img_details) - - @property - def markdown_str(self): - """return markdown url str to replace""" - return self._markdown_str - - @staticmethod - def _get_md_url_str(img_data: Dict[str, Union[int, str]]) -> str: - url_str = "" - if 'content' in img_data: - if _MARKDOWN_STR_CHECK in img_data['content']: - url_str = img_data['content'][_MARKDOWN_STR_CHECK] - # check to see if empty before doing find - if not url_str: - return "" - # find the link between two parenthesis - # - markdown format - return url_str[url_str.find("(")+1:url_str.find(")")] - # pylint: disable=too-many-instance-attributes class PageArchiver: """ PageArchiver handles all data extraction and modifications - to Bookstack page contents including images. + to Bookstack page contents including assets like images or attachments. Args: :archive_dir: = directory where data will be put into. 
@@ -100,32 +47,52 @@ def __init__(self, archive_dir: str, config: ConfigNode) -> None: # name of the base folder to use within the tgz archive (internal tar layout) self.archive_base_path = archive_dir.split("/")[-1] self.modify_md: bool = self._check_md_modify() + self.asset_archiver = AssetArchiver(self.api_urls, self._headers, + self.verify_ssl) def _check_md_modify(self) -> bool: # check to ensure they have asset_config defined, could be None - if _MARKDOWN_STR_CHECK in self.export_formats: - return self.asset_config.modify_markdown and self.export_images + if 'markdown' in self.export_formats: + return self.asset_config.modify_markdown and \ + ( self.export_images or self.export_attachments) return False - def archive_page(self, page: Node, - image_urls: List[str] = None): - """export page content""" - for export_format in self.export_formats: - page_data = self._get_page_data(page.id_, export_format) - self._archive_page(page, export_format, - page_data, image_urls) - if self.asset_config.export_meta: - self._archive_page_meta(page.file_path, page.meta) - - def _archive_page(self, page: Node, export_format: str, data: bytes, - image_nodes: List[ImageNode] = None): + def archive_pages(self, page_nodes: Dict[int, Node]): + """export page contents and their images/attachments""" + # get assets first if requested + # this is because we may want to manipulate page data with modify_markdown flag + image_nodes = self._get_image_meta() + attachment_nodes = self._get_attachment_meta() + for _, page in page_nodes.items(): + page_images = [] + page_attachments = [] + if page.id_ in image_nodes: + page_images = image_nodes[page.id_] + if page.id_ in attachment_nodes: + page_attachments = attachment_nodes[page.id_] + for export_format in self.export_formats: + page_data = self._get_page_data(page.id_, export_format) + if page_images and export_format == 'markdown': + page_data = self._modify_markdown("images", page.name, + page_data, page_images) + if page_attachments and 
export_format == 'markdown': + page_data = self._modify_markdown("attachments", page.name, + page_data, page_attachments) + self._archive_page(page, export_format, + page_data) + self.archive_page_assets("images", page.parent.file_path, + page.name, page_images) + self.archive_page_assets("attachments", page.parent.file_path, + page.name, page_attachments) + if self.asset_config.export_meta: + self._archive_page_meta(page.file_path, page.meta) + + def _archive_page(self, page: Node, export_format: str, data: bytes): page_file_name = f"{self.archive_base_path}/" \ f"{page.file_path}{_FILE_EXTENSION_MAP[export_format]}" - if self.modify_md and export_format == _MARKDOWN_STR_CHECK and image_nodes: - data = self._update_image_links(page.name, data, image_nodes) self.write_data(page_file_name, data) - def _get_page_data(self, page_id: int, export_format: str): + def _get_page_data(self, page_id: int, export_format: str) -> bytes: url = f"{self.api_urls['pages']}/{page_id}/{_EXPORT_API_PATH}/{export_format}" return archiver_util.get_byte_response(url=url, headers=self._headers, verify_ssl=self.verify_ssl) @@ -135,24 +102,36 @@ def _archive_page_meta(self, page_path: str, meta_data: Dict[str, Union[str, int bytes_meta = archiver_util.get_json_bytes(meta_data) self.write_data(file_path=meta_file_name, data=bytes_meta) - def get_image_meta(self) -> Dict[int, List[ImageNode]]: + def _get_image_meta(self) -> Dict[int, List[ImageNode]]: """Get all image metadata into a {page_number: [image_url]} format""" - img_meta_response: Response = common_util.http_get_request( - self.api_urls['images'], - self._headers, - self.verify_ssl) - img_meta_json = img_meta_response.json()['data'] - return self._create_image_map(img_meta_json) - - def archive_page_images(self, parent_path: str, page_name: str, - image_nodes: List[ImageNode]): + if not self.asset_config.export_images: + return {} + return self.asset_archiver.get_asset_nodes('images') + + def _get_attachment_meta(self) -> Dict[int, 
List[AttachmentNode]]: + """Get all attachment metadata into a {page_number: [attachment_url]} format""" + if not self.asset_config.export_attachments: + return {} + return self.asset_archiver.get_asset_nodes('attachments') + + def _modify_markdown(self, asset_type: str, + page_name: str, page_data: bytes, + asset_nodes: List[ImageNode | AttachmentNode]) -> bytes: + if not self.modify_md: + return page_data + return self.asset_archiver.update_asset_links(asset_type, page_name, page_data, + asset_nodes) + + def archive_page_assets(self, asset_type: str, parent_path: str, page_name: str, + asset_nodes: List[ImageNode | AttachmentNode]): """pull images locally into a directory based on page""" - image_base_path = f"{self.archive_base_path}/{parent_path}/{_IMAGE_DIR_NAME}" - for img_node in image_nodes: - img_data: bytes = archiver_util.get_byte_response(img_node.url, self._headers, - self.verify_ssl) - image_path = f"{image_base_path}/{page_name}/{img_node.name}" - self.write_data(image_path, img_data) + if not asset_nodes: + return + node_base_path = f"{self.archive_base_path}/{parent_path}/" + for asset_node in asset_nodes: + asset_data = self.asset_archiver.get_asset_bytes(asset_type, asset_node.url) + asset_path = f"{node_base_path}/{asset_node.get_relative_path(page_name)}" + self.write_data(asset_path, asset_data) def write_data(self, file_path: str, data: bytes): """write data to a tar file @@ -167,21 +146,6 @@ def gzip_archive(self): """provide the tar to gzip and the name of the gzip output file""" archiver_util.create_gzip(self.tar_file, self.archive_file) - def _update_image_links(self, page_name: str, page_data: bytes, - image_nodes: List[ImageNode]) -> bytes: - """regex replace links to local created directories""" - for img_node in image_nodes: - img_meta_url = f"{self.api_urls['images']}/{img_node.id}" - img_details = common_util.http_get_request(img_meta_url, - self._headers, self.verify_ssl) - img_node.set_markdown_content(img_details.json()) - if not 
img_node.markdown_str: - continue - # 1 - what to replace, 2 - replace with, 3 is the data to replace - page_data = re.sub(img_node.markdown_str.encode(), - img_node.get_image_relative_path(page_name).encode(), page_data) - return page_data - @property def file_extension_map(self) -> Dict[str, str]: """file extension metadata""" @@ -192,23 +156,12 @@ def export_images(self) -> bool: """return whether or not to export images""" return self.asset_config.export_images + @property + def export_attachments(self) -> bool: + """return whether or not to export attachments""" + return self.asset_config.export_attachments + @property def verify_ssl(self) -> bool: """return whether or not to verify ssl for http requests""" return self.asset_config.verify_ssl - - # @staticmethod - # def _get_regex_expr(url: str) -> bytes: - # # regex_str = fr"\[\!\[^$|.*\]\({url}\)\]" - # return re.compile(regex_str.encode()) - - @staticmethod - def _create_image_map(json_data: List[Dict[str, Union[str,int]]]) -> Dict[int, List[ImageNode]]: - image_page_map = {} - for img_meta in json_data: - img_node = ImageNode(img_meta) - if img_node.page_id in image_page_map: - image_page_map[img_node.page_id].append(img_node) - else: - image_page_map[img_node.page_id] = [img_node] - return image_page_map diff --git a/bookstack_file_exporter/config_helper/config_helper.py b/bookstack_file_exporter/config_helper/config_helper.py index 46911b2..80b822f 100644 --- a/bookstack_file_exporter/config_helper/config_helper.py +++ b/bookstack_file_exporter/config_helper/config_helper.py @@ -19,7 +19,8 @@ "books": "api/books", "chapters": "api/chapters", "pages": "api/pages", - "images": "api/image-gallery" + "images": "api/image-gallery", + "attachments": "api/attachments" } _UNASSIGNED_BOOKS_DIR = "unassigned/" diff --git a/bookstack_file_exporter/config_helper/models.py b/bookstack_file_exporter/config_helper/models.py index 8b37b3d..e82e5da 100644 --- a/bookstack_file_exporter/config_helper/models.py +++ 
b/bookstack_file_exporter/config_helper/models.py @@ -23,6 +23,7 @@ class BookstackAccess(BaseModel): class Assets(BaseModel): """YAML schema for bookstack markdown asset(pages/images/attachments) configuration""" export_images: Optional[bool] = False + export_attachments: Optional[bool] = False modify_markdown: Optional[bool] = False export_meta: Optional[bool] = False verify_ssl: Optional[bool] = True diff --git a/bookstack_file_exporter/exporter/exporter.py b/bookstack_file_exporter/exporter/exporter.py index 910c30c..52702f2 100644 --- a/bookstack_file_exporter/exporter/exporter.py +++ b/bookstack_file_exporter/exporter/exporter.py @@ -156,7 +156,8 @@ def get_all_pages(self, book_nodes: Dict[int, Node]) -> Dict[int, Node]: # add `page` flag, we only want pages # filter out chapters for now # chapters can have their own children/pages - page_nodes: Dict[int, Node] = self.get_child_nodes("pages", book_nodes, node_type="page") + page_nodes: Dict[int, Node] = self.get_child_nodes("pages", + book_nodes, node_type="page") ## chapters (if exists) # chapter nodes are treated a little differently # chapters are children under books diff --git a/bookstack_file_exporter/exporter/node.py b/bookstack_file_exporter/exporter/node.py index d63e17f..6fa5a86 100644 --- a/bookstack_file_exporter/exporter/node.py +++ b/bookstack_file_exporter/exporter/node.py @@ -1,4 +1,6 @@ from typing import Dict, Union, List +import unicodedata +from re import sub as re_sub # shelves --> 'books' # books --> 'content' @@ -34,7 +36,8 @@ def __init__(self, meta: Dict[str, Union[str, int]], self._parent = parent self._path_prefix = path_prefix # for convenience/usage for exporter - self.name: str = self.meta['slug'] + # self.name: str = self.meta['slug'] + self.name = self.get_name(self.meta['slug'], self.meta['name']) self.id_: int = self.meta['id'] self._display_name = self.meta['name'] # children @@ -42,6 +45,14 @@ def __init__(self, meta: Dict[str, Union[str, int]], # if parent self._file_path 
= self._get_file_path() + def get_name(self, slug: str, name: str) -> str: + """return name of resource""" + if slug: + return slug + if name != _NULL_PAGE_NAME: + return self.slugify(name) + return "" + def _get_file_path(self) -> str: if self._parent: # page node @@ -86,3 +97,23 @@ def empty(self): if not self.name and self._display_name == _NULL_PAGE_NAME: return True return False + + @staticmethod + def slugify(value: str, allow_unicode=False): + """ + Taken from https://github.com/django/django/blob/master/django/utils/text.py + Convert to ASCII if 'allow_unicode' is False. Convert spaces or repeated + dashes to single dashes. Remove characters that aren't alphanumerics, + underscores, or hyphens. Convert to lowercase. Also strip leading and + trailing whitespace, dashes, and underscores. + """ + if allow_unicode: + value = unicodedata.normalize("NFKC", value) + else: + value = ( + unicodedata.normalize("NFKD", value) + .encode("ascii", "ignore") + .decode("ascii") + ) + value = re_sub(r"[^\w\s-]", "", value.lower()) + return re_sub(r"[-\s]+", "-", value).strip("-_") diff --git a/examples/config.yml b/examples/config.yml index 2a12cb1..cca6602 100644 --- a/examples/config.yml +++ b/examples/config.yml @@ -29,6 +29,9 @@ assets: # optional export of all the images used in a page(s). # omit this or set to false if not needed export_images: false + # optional export of all the attachments used in a page(s). + # omit this or set to false if not needed + export_attachments: false # optional modify markdown files to replace image url links # with local exported image paths modify_markdown: false diff --git a/examples/minio_config.yml b/examples/minio_config.yml index 692cacd..3b36a08 100644 --- a/examples/minio_config.yml +++ b/examples/minio_config.yml @@ -58,6 +58,9 @@ assets: # optional export of all the images used in a page(s). # omit this or set to false if not needed export_images: false + # optional export of all the attachments used in a page(s). 
+ # omit this or set to false if not needed + export_attachments: false # optional modify markdown files to replace image url links # with local exported image paths modify_markdown: false diff --git a/setup.cfg b/setup.cfg index ef4786a..dffaf0c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = bookstack-file-exporter -# version will be replaced by IMAGE_TAG in Makefile +# version will be replaced by IMAGE_TAG via Github Actions version = 0.0.1 author = pchang388 # author_email = your@email.address @@ -18,9 +18,9 @@ classifiers = python_requires = >=3.8 install_requires = Pyyaml >= 6.0.1 # https://pypi.org/project/PyYAML/ - Pydantic >= 2.4.0 # https://docs.pydantic.dev/latest/ + Pydantic >= 2.5.3 # https://docs.pydantic.dev/latest/ requests >= 2.31.0 # https://pypi.org/project/requests/ - minio >= 7.2.0 # https://pypi.org/project/minio/ + minio >= 7.2.3 # https://pypi.org/project/minio/ packages = find: [options.entry_points]