diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 348dd66..16d03db 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -7,7 +7,7 @@ "features": { "ghcr.io/devcontainers/features/python:1": { "installTools": true, - "version": "3.12" + "version": "3.12.1" } }, "customizations": { diff --git a/Makefile b/Makefile index 998062a..08edc58 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ BASE_IMAGE_TAG=3.12-slim-bookworm IMAGE_NAME=homeylab/bookstack-file-exporter # keep this start sequence unique (IMAGE_TAG=) # github actions will use this to create a tag -IMAGE_TAG=1.0.1 +IMAGE_TAG=1.0.2 DOCKER_WORK_DIR=/export DOCKER_CONFIG_DIR=/export/config DOCKER_EXPORT_DIR=/export/dump diff --git a/README.md b/README.md index b204885..ed5d612 100644 --- a/README.md +++ b/README.md @@ -69,11 +69,11 @@ host: "https://bookstack.yourdomain.com" credentials: token_id: "" token_secret: "" -formats: +formats: # md only example - markdown -- html -- pdf -- plaintext +# - html +# - pdf +# - plaintext output_path: "bkps/" assets: export_images: false @@ -176,33 +176,12 @@ Env variables for credentials will take precedence over configuration file optio **For object storage authentication**, find the relevant sections further down in their respective sections. ### Configuration -See below for an example and explanation. Optionally, look at `examples/` folder of the github repo for more examples. Ensure [Authentication](#authentication) has been set up beforehand for required credentials. +_Ensure [Authentication](#authentication) has been set up beforehand for required credentials._ For a simple example to run quickly, refer to the one in the [Using This Application](#using-this-application) section. A full example is also shown below with descriptions. Optionally, look at `examples/` folder of the github repo for more examples. For object storage configuration, find more information in their respective sections - [Minio](#minio-backups) -> Schema and values are checked so ensure proper settings are provided. As mentioned, credentials can be specified as environment variables instead if preferred. - -#### Just Run -Below is an example configuration to just get quickly running without any additional options. - -```yaml -host: "https://bookstack.yourdomain.com" -credentials: - token_id: "" - token_secret: "" -formats: # md only example -- markdown -# - html -# - pdf -# - plaintext -output_path: "bkps/" -assets: - export_images: false - modify_markdown: false - export_meta: false - verify_ssl: true - ``` +**Schema and values are checked so ensure proper settings are provided. As mentioned, credentials can be specified as environment variables instead if preferred.** #### Full Example Below is an example configuration that shows example values for all possible options. diff --git a/bookstack_file_exporter/archiver/page_archiver.py b/bookstack_file_exporter/archiver/page_archiver.py index 9c0b07a..812da99 100644 --- a/bookstack_file_exporter/archiver/page_archiver.py +++ b/bookstack_file_exporter/archiver/page_archiver.py @@ -52,15 +52,15 @@ def get_image_relative_path(self, page_name: str) -> str: """return image path local to page directory""" return f"{self._relative_path_prefix}/{page_name}/{self.name}" + def set_markdown_content(self, img_details: Dict[str, Union[int, str]]): + """provide image metadata to set markdown properties""" + self._markdown_str = self._get_md_url_str(img_details) + @property def markdown_str(self): """return markdown url str to replace""" return self._markdown_str - def set_markdown_content(self, img_details: Dict[str, Union[int, str]]): - """provide image metadata to set markdown properties""" - self._markdown_str = self._get_md_url_str(img_details) - @staticmethod def _get_md_url_str(img_data: Dict[str, Union[int, str]]) -> str: url_str = "" @@ -70,6 +70,8 @@ def _get_md_url_str(img_data: Dict[str, Union[int, str]]) -> str: # check to see if empty before doing find if not url_str: return "" + # find the link between two parenthesis + # - markdown format return url_str[url_str.find("(")+1:url_str.find(")")] # pylint: disable=too-many-instance-attributes diff --git a/bookstack_file_exporter/common/util.py b/bookstack_file_exporter/common/util.py index ff9e9bc..4591e9d 100644 --- a/bookstack_file_exporter/common/util.py +++ b/bookstack_file_exporter/common/util.py @@ -26,6 +26,15 @@ def http_get_request(url: str, headers: Dict[str, str], except Exception as req_err: log.error("Failed to make request for %s", url) raise req_err + try: + #raise_for_status() throws an exception on codes 400-599 + response.raise_for_status() + except requests.exceptions.HTTPError as e: + # this means it either exceeded 50X retries in `http_get_request` handler + # or it returned a 40X which is not expected + log.error("Bookstack request failed with status code: %d on url: %s", + response.status_code, url) + raise e return response def should_verify(url: str) -> str: diff --git a/bookstack_file_exporter/config_helper/config_helper.py b/bookstack_file_exporter/config_helper/config_helper.py index 0c296bd..46911b2 100644 --- a/bookstack_file_exporter/config_helper/config_helper.py +++ b/bookstack_file_exporter/config_helper/config_helper.py @@ -135,6 +135,7 @@ def _generate_urls(self) -> Dict[str, str]: url_prefix = "" for key, value in _API_PATHS.items(): urls[key] = f"{url_prefix}{self.user_inputs.host}/{value}" + log.debug("api urls: %s", urls) return urls def _set_base_dir(self, cmd_output_dir: str) -> str: diff --git a/bookstack_file_exporter/exporter/exporter.py b/bookstack_file_exporter/exporter/exporter.py index fae484a..910c30c 100644 --- a/bookstack_file_exporter/exporter/exporter.py +++ b/bookstack_file_exporter/exporter/exporter.py @@ -1,6 +1,9 @@ from typing import Dict, List, Union import logging +# pylint: disable=import-error +from requests import Response + from bookstack_file_exporter.exporter.node import Node from bookstack_file_exporter.common import util @@ -35,7 +38,7 @@ def get_all_shelves(self) -> Dict[int, Node]: def _get_json_response(self, url: str) -> List[Dict[str, Union[str,int]]]: """get http response data in json format""" - response = util.http_get_request(url=url, headers=self.headers, + response: Response = util.http_get_request(url=url, headers=self.headers, verify_ssl=self.verify_ssl) return response.json() @@ -77,42 +80,52 @@ def _get_chapters(self, base_url: str, all_chapters: List[int], return chapter_nodes def get_child_nodes(self, resource_type: str, parent_nodes: Dict[int, Node], - filter_empty: bool = True) -> Dict[int, Node]: + filter_empty: bool = True, node_type: str = "") -> Dict[int, Node]: """get child nodes from a book/chapter/shelf""" base_url = self.api_urls[resource_type] - return self._get_children(base_url, parent_nodes, filter_empty) + return self._get_children(base_url, parent_nodes, filter_empty, node_type) def _get_children(self, base_url: str, parent_nodes: Dict[int, Node], - filter_empty: bool) -> Dict[int, Node]: + filter_empty: bool, node_type: str = "") -> Dict[int, Node]: child_nodes = {} for _, parent in parent_nodes.items(): if parent.children: for child in parent.children: + if node_type: + # only used for Book Nodes to get children Page/Chapter Nodes + # access key directly, don't create a Node if not needed + # chapters and pages always have `type` from what I can tell + if not child['type'] == node_type: + log.debug("Book Node child of type: %s is not desired type: %s", + child['type'], node_type) + continue child_id = child['id'] child_url = f"{base_url}/{child_id}" child_data = self._get_json_response(child_url) child_node = Node(child_data, parent) if filter_empty: + # if it is not empty, add it + # skip it if empty if not child_node.empty: child_nodes[child_id] = child_node else: child_nodes[child_id] = child_node return child_nodes - def get_unassigned_books(self, existing_resources: Dict[int, Node], + def get_unassigned_books(self, existing_books: Dict[int, Node], path_prefix: str) -> Dict[int, Node]: """get books not under a shelf""" - base_url = self.api_urls["books"] - all_resources: List[int] = self._get_all_ids(base_url) + book_url = self.api_urls["books"] + all_books: List[int] = self._get_all_ids(book_url) unassigned = [] - # get all existing ones and compare against current known resources - for resource_id in all_resources: - if resource_id not in existing_resources: - unassigned.append(resource_id) + # get all existing ones and compare against current known books + for book in all_books: + if book not in existing_books: + unassigned.append(book) if not unassigned: return {} # books with no shelf treated like a parent resource - return self._get_parents(base_url, unassigned, path_prefix) + return self._get_parents(book_url, unassigned, path_prefix) # convenience function def get_all_books(self, shelve_nodes: Dict[int, Node], unassigned_dir: str) -> Dict[int, Node]: @@ -140,7 +153,10 @@ def get_all_pages(self, book_nodes: Dict[int, Node]) -> Dict[int, Node]: ## pages page_nodes = {} if book_nodes: - page_nodes: Dict[int, Node] = self.get_child_nodes("pages", book_nodes) + # add `page` flag, we only want pages + # filter out chapters for now + # chapters can have their own children/pages + page_nodes: Dict[int, Node] = self.get_child_nodes("pages", book_nodes, node_type="page") ## chapters (if exists) # chapter nodes are treated a little differently # chapters are children under books