From b68a9598f318f75d0043cc15a37a9188af1ba471 Mon Sep 17 00:00:00 2001 From: pchang388 Date: Tue, 29 Oct 2024 21:12:56 -0400 Subject: [PATCH 1/5] bump dep versions --- setup.cfg | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index ef0c2bf..2cc3992 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,10 +17,10 @@ classifiers = [options] python_requires = >=3.8 install_requires = - Pyyaml >= 6.0.1 # https://pypi.org/project/PyYAML/ - Pydantic >= 2.8.2 # https://docs.pydantic.dev/latest/ + Pyyaml >= 6.0.2 # https://pypi.org/project/PyYAML/ + Pydantic >= 2.9.2 # https://docs.pydantic.dev/latest/ requests >= 2.32.3 # https://pypi.org/project/requests/ - minio >= 7.2.7 # https://pypi.org/project/minio/ + minio >= 7.2.10 # https://pypi.org/project/minio/ packages = find: [options.entry_points] From 92af2826fef0f007866e2378691d936f87ebc6fc Mon Sep 17 00:00:00 2001 From: pchang388 Date: Thu, 31 Oct 2024 00:46:49 -0400 Subject: [PATCH 2/5] attempt to add logic to skip an image or attachment export if their API call fails --- .../archiver/asset_archiver.py | 7 ++-- .../archiver/page_archiver.py | 35 ++++++++++++++----- bookstack_file_exporter/exporter/node.py | 1 + bookstack_file_exporter/run.py | 1 + 4 files changed, 33 insertions(+), 11 deletions(-) diff --git a/bookstack_file_exporter/archiver/asset_archiver.py b/bookstack_file_exporter/archiver/asset_archiver.py index 71f2583..dbea2bf 100644 --- a/bookstack_file_exporter/archiver/asset_archiver.py +++ b/bookstack_file_exporter/archiver/asset_archiver.py @@ -24,7 +24,7 @@ class AssetNode: AssetNode instance for use in other classes """ def __init__(self, meta_data: Dict[str, int | str | bool]): - self.id: int = meta_data['id'] + self.id_: int = meta_data['id'] self.page_id: int = meta_data['uploaded_to'] self.url: str = "" self.name: str = "" @@ -88,7 +88,7 @@ class AttachmentNode(AssetNode): def __init__(self, meta_data: Dict[str, Union[int, str, bool]], base_url: str): super().__init__(meta_data) - self.url: str = f"{base_url}/{self.id}" + self.url: str = f"{base_url}/{self.id_}" self.name = meta_data['name'] log.debug("Attachment node has generated url: %s", self.url) self._relative_path_prefix = f"{_ATTACHMENT_DIR_NAME}" @@ -140,7 +140,7 @@ def get_asset_nodes(self, asset_type: str) -> Dict[str, ImageNode | AttachmentNo def get_asset_data(self, asset_type: str, meta_data: Union[AttachmentNode, ImageNode]) -> Dict[str, str | bool | int | dict]: """Get asset data based on type""" - data_url = f"{self.api_urls[asset_type]}/{meta_data.id}" + data_url = f"{self.api_urls[asset_type]}/{meta_data.id_}" asset_data_response: Response = common_util.http_get_request( data_url, self._headers, @@ -164,6 +164,7 @@ def update_asset_links(self, asset_type, page_name: str, page_data: bytes, asset_nodes: List[ImageNode | AttachmentNode]) -> bytes: """update markdown links in page data""" for asset_node in asset_nodes: + # get metadata instead of raw data/bytes asset_data = self.get_asset_data(asset_type, asset_node) asset_node.set_markdown_content(asset_data) if not asset_node.markdown_str: diff --git a/bookstack_file_exporter/archiver/page_archiver.py b/bookstack_file_exporter/archiver/page_archiver.py index 4398e27..a1106b3 100644 --- a/bookstack_file_exporter/archiver/page_archiver.py +++ b/bookstack_file_exporter/archiver/page_archiver.py @@ -1,10 +1,12 @@ from typing import Union, List, Dict - +import logging from bookstack_file_exporter.exporter.node import Node from bookstack_file_exporter.archiver import util as archiver_util from bookstack_file_exporter.archiver.asset_archiver import AssetArchiver, ImageNode, AttachmentNode from bookstack_file_exporter.config_helper.config_helper import ConfigNode +log = logging.getLogger(__name__) + _META_FILE_SUFFIX = "_meta.json" _TAR_SUFFIX = ".tar" _TAR_GZ_SUFFIX = ".tgz" @@ -70,6 +72,16 @@ def archive_pages(self, page_nodes: Dict[int, Node]): page_images = image_nodes[page.id_] if page.id_ in attachment_nodes: page_attachments = attachment_nodes[page.id_] + failed_images = self.archive_page_assets("images", page.parent.file_path, + page.name, page_images) + failed_attach = self.archive_page_assets("attachments", page.parent.file_path, + page.name, page_attachments) + if failed_images: + # exclude from page_images so it doesn't attempt to get modified in markdown file + page_images = [img for img in page_images if img.id_ not in failed_images] + if failed_attach: + # exclude from page_attachments so it doesn't attempt to get modified in markdown file + page_attachments = [attach for attach in page_attachments if attach.id_ not in failed_attach] for export_format in self.export_formats: page_data = self._get_page_data(page.id_, export_format) if page_images and export_format == 'markdown': @@ -80,10 +92,6 @@ def archive_pages(self, page_nodes: Dict[int, Node]): page_data, page_attachments) self._archive_page(page, export_format, page_data) - self.archive_page_assets("images", page.parent.file_path, - page.name, page_images) - self.archive_page_assets("attachments", page.parent.file_path, - page.name, page_attachments) if self.asset_config.export_meta: self._archive_page_meta(page.file_path, page.meta) @@ -123,15 +131,26 @@ def _modify_markdown(self, asset_type: str, asset_nodes) def archive_page_assets(self, asset_type: str, parent_path: str, page_name: str, - asset_nodes: List[ImageNode | AttachmentNode]): + asset_nodes: List[ImageNode | AttachmentNode]) -> Dict[int, int]: """pull images locally into a directory based on page""" if not asset_nodes: - return + return {} + # use a map for faster lookup + failed_assets = {} node_base_path = f"{self.archive_base_path}/{parent_path}/" for asset_node in asset_nodes: - asset_data = self.asset_archiver.get_asset_bytes(asset_type, asset_node.url) + try: + asset_data = self.asset_archiver.get_asset_bytes(asset_type, asset_node.url) + except: + # probably unnecessary, but just in case + if asset_node.id_ not in failed_assets: + failed_assets[asset_node.id_] = 0 + # a 404 or other error occurred, skip this asset, already logged in http request exception + log.error(f"Failed to get image or attachment data for asset located at: {asset_node.url} - skipping") + continue asset_path = f"{node_base_path}/{asset_node.get_relative_path(page_name)}" self.write_data(asset_path, asset_data) + return failed_assets def write_data(self, file_path: str, data: bytes): """write data to a tar file diff --git a/bookstack_file_exporter/exporter/node.py b/bookstack_file_exporter/exporter/node.py index 6fa5a86..5abe41d 100644 --- a/bookstack_file_exporter/exporter/node.py +++ b/bookstack_file_exporter/exporter/node.py @@ -38,6 +38,7 @@ def __init__(self, meta: Dict[str, Union[str, int]], # for convenience/usage for exporter # self.name: str = self.meta['slug'] self.name = self.get_name(self.meta['slug'], self.meta['name']) + # id() is a built-in function and should not be used as a variable name self.id_: int = self.meta['id'] self._display_name = self.meta['name'] # children diff --git a/bookstack_file_exporter/run.py b/bookstack_file_exporter/run.py index 2ad787c..20f586d 100644 --- a/bookstack_file_exporter/run.py +++ b/bookstack_file_exporter/run.py @@ -53,4 +53,5 @@ def exporter(args: argparse.Namespace): # clean up the .tgz archive since it is already uploaded archive.clean_up() + log.info(f"Created file archive: {archive.archive_dir}.tgz") log.info("Completed run") From 43700558ae75a764289497db35f7627b73cbf771 Mon Sep 17 00:00:00 2001 From: pchang388 Date: Thu, 31 Oct 2024 01:04:47 -0400 Subject: [PATCH 3/5] update base image to use python 3.13.0 --- Dockerfile | 2 +- Makefile | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index c493943..fd258fa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ ARG BASE_IMAGE=python -ARG BASE_IMAGE_TAG=3.12.4-slim-bookworm +ARG BASE_IMAGE_TAG=3.13.0-slim-bookworm FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} diff --git a/Makefile b/Makefile index 93ae354..8957bef 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ ## DOCKER BUILD VARS BASE_IMAGE=python -BASE_IMAGE_TAG=3.12.4-slim-bookworm +BASE_IMAGE_TAG=3.13.0-slim-bookworm IMAGE_NAME=homeylab/bookstack-file-exporter # keep this start sequence unique (IMAGE_TAG=) # github actions will use this to create a tag @@ -27,6 +27,16 @@ upload_testpypi: download_testpypi: python -m pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple bookstack-file-exporter +docker_build_simple: + docker build \ + --build-arg BASE_IMAGE=${BASE_IMAGE} \ + --build-arg BASE_IMAGE_TAG=${BASE_IMAGE_TAG} \ + --build-arg DOCKER_WORK_DIR=${DOCKER_WORK_DIR} \ + --build-arg DOCKER_CONFIG_DIR=${DOCKER_CONFIG_DIR} \ + --build-arg DOCKER_EXPORT_DIR=${DOCKER_EXPORT_DIR} \ + -t ${IMAGE_NAME}:${IMAGE_TAG} \ + --no-cache . + docker_build: docker buildx build \ --platform linux/amd64,linux/arm64 \ From cd553b7bf8d43b2bb311484468282ee05764d6ac Mon Sep 17 00:00:00 2001 From: pchang388 Date: Thu, 31 Oct 2024 01:05:51 -0400 Subject: [PATCH 4/5] bump test python versions to match --- .github/actions/python/action.yml | 2 +- .github/actions/tests/action.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/actions/python/action.yml b/.github/actions/python/action.yml index 677c051..e101b14 100644 --- a/.github/actions/python/action.yml +++ b/.github/actions/python/action.yml @@ -23,7 +23,7 @@ runs: - name: Set up Python uses: actions/setup-python@v3 with: - python-version: '3.12.4' + python-version: '3.13.0' - name: Install Dependencies shell: bash run: | diff --git a/.github/actions/tests/action.yml b/.github/actions/tests/action.yml index 3b7ed0c..97805a3 100644 --- a/.github/actions/tests/action.yml +++ b/.github/actions/tests/action.yml @@ -8,7 +8,7 @@ runs: - name: Set up Python uses: actions/setup-python@v3 with: - python-version: '3.12.4' + python-version: '3.13.0' - name: Install dependencies shell: bash run: | From 82fae8cfa588009666e037c8ca39235b1360bd11 Mon Sep 17 00:00:00 2001 From: pchang388 Date: Thu, 31 Oct 2024 01:32:15 -0400 Subject: [PATCH 5/5] pylint adjustments --- .devcontainer/devcontainer.json | 2 +- Makefile | 3 +++ .../archiver/page_archiver.py | 19 +++++++++++++------ bookstack_file_exporter/run.py | 2 +- 4 files changed, 18 insertions(+), 8 deletions(-) diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 82d6346..14b78b0 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -7,7 +7,7 @@ "features": { "ghcr.io/devcontainers/features/python:1": { "installTools": true, - "version": "3.12.4" + "version": "3.13.0" } }, "customizations": { diff --git a/Makefile b/Makefile index 8957bef..3eef2ba 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,9 @@ build: python -m pip install --upgrade build python -m build +lint: + pylint bookstack_file_exporter + upload_testpypi: python -m pip install --upgrade twine python -m twine upload --repository testpypi dist/* diff --git a/bookstack_file_exporter/archiver/page_archiver.py b/bookstack_file_exporter/archiver/page_archiver.py index a1106b3..84ba201 100644 --- a/bookstack_file_exporter/archiver/page_archiver.py +++ b/bookstack_file_exporter/archiver/page_archiver.py @@ -1,5 +1,7 @@ from typing import Union, List, Dict import logging +# pylint: disable=import-error +from requests.exceptions import HTTPError from bookstack_file_exporter.exporter.node import Node from bookstack_file_exporter.archiver import util as archiver_util from bookstack_file_exporter.archiver.asset_archiver import AssetArchiver, ImageNode, AttachmentNode @@ -76,12 +78,15 @@ def archive_pages(self, page_nodes: Dict[int, Node]): page.name, page_images) failed_attach = self.archive_page_assets("attachments", page.parent.file_path, page.name, page_attachments) + # exclude from page_images + # so it doesn't attempt to get modified in markdown file if failed_images: - # exclude from page_images so it doesn't attempt to get modified in markdown file page_images = [img for img in page_images if img.id_ not in failed_images] + # exclude from page_attachments + # so it doesn't attempt to get modified in markdown file if failed_attach: - # exclude from page_attachments so it doesn't attempt to get modified in markdown file - page_attachments = [attach for attach in page_attachments if attach.id_ not in failed_attach] + page_attachments = [attach for attach in page_attachments + if attach.id_ not in failed_attach] for export_format in self.export_formats: page_data = self._get_page_data(page.id_, export_format) if page_images and export_format == 'markdown': @@ -141,12 +146,14 @@ def archive_page_assets(self, asset_type: str, parent_path: str, page_name: str, for asset_node in asset_nodes: try: asset_data = self.asset_archiver.get_asset_bytes(asset_type, asset_node.url) - except: + except HTTPError: # probably unnecessary, but just in case if asset_node.id_ not in failed_assets: failed_assets[asset_node.id_] = 0 - # a 404 or other error occurred, skip this asset, already logged in http request exception - log.error(f"Failed to get image or attachment data for asset located at: {asset_node.url} - skipping") + # a 404 or other error occurred + # skip this asset + log.error("Failed to get image or attachment data " \ + "for asset located at: %s - skipping", asset_node.url) continue asset_path = f"{node_base_path}/{asset_node.get_relative_path(page_name)}" self.write_data(asset_path, asset_data) diff --git a/bookstack_file_exporter/run.py b/bookstack_file_exporter/run.py index 20f586d..f63a215 100644 --- a/bookstack_file_exporter/run.py +++ b/bookstack_file_exporter/run.py @@ -53,5 +53,5 @@ def exporter(args: argparse.Namespace): # clean up the .tgz archive since it is already uploaded archive.clean_up() - log.info(f"Created file archive: {archive.archive_dir}.tgz") + log.info("Created file archive: %s.tgz", archive.archive_dir) log.info("Completed run")