diff --git a/.github/workflows/on_release.yml b/.github/workflows/on_release.yml index 3cbe760..59e11de 100644 --- a/.github/workflows/on_release.yml +++ b/.github/workflows/on_release.yml @@ -54,6 +54,8 @@ jobs: timeout-minutes: 20 environment: 'PyPi' steps: + - name: Checkout + uses: actions/checkout@v4 - name: Get tag release without v shell: bash run: | diff --git a/README.md b/README.md index b8fc1b3..5fa5732 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ Table of Contents - [Using This Application](#using-this-application) - [Run via Pip](#run-via-pip) - [Run via Docker](#run-via-docker) + - [Run via Helm](#run-via-helm) - [Authentication and Permissions](#authentication-and-permissions) - [Configuration](#configuration) - [Backup Behavior](#backup-behavior) @@ -16,6 +17,7 @@ Table of Contents - [Modify Markdown Files](#modify-markdown-files) - [Object Storage](#object-storage) - [Minio Backups](#minio-backups) + - [Potential Breaking Upgrades](#potential-breaking-upgrades) - [Future Items](#future-items) ## Background @@ -68,8 +70,8 @@ Simple example configuration: # config.yml host: "https://bookstack.yourdomain.com" credentials: - token_id: "" - token_secret: "" + token_id: "" + token_secret: "" formats: # md only example - markdown # - html @@ -77,11 +79,10 @@ formats: # md only example # - plaintext output_path: "bkps/" assets: - export_images: false - export_attachments: false - modify_markdown: false - export_meta: false - verify_ssl: true + export_images: false + export_attachments: false + modify_markdown: false + export_meta: false ``` ### Run via Pip @@ -180,6 +181,9 @@ docker run \ | `config` | `/export/config/config.yml` | A valid configuration file |`-v /local/yourpath/config.yml:/export/config/config.yml:ro`| | `dump` | `/export/dump` | Directory to place exports. **This is optional when using remote storage option(s)**. Omit if you don't need a local copy. 
| `-v /local/yourpath/bkps:/export/dump` | +### Run via Helm +A Helm chart can be used to run the exporter as a CronJob or Deployment resource. See [here](https://github.com/homeylab/helm-charts/tree/main/charts/bookstack-file-exporter) for more information on using the Helm chart. + ### Authentication and Permissions #### Permissions **Note visibility of pages is based on user**, so use a user that has read access to pages and content you want to back up. *The role assigned to the user* should have the additional permissions for target pages and their content: @@ -201,7 +205,9 @@ Env variables for credentials will take precedence over configuration file optio **For object storage authentication**, find the relevant sections further down in their respective sections. ### Configuration -_Ensure [Authentication](#authentication-and-permissions) has been set up beforehand for required credentials._ For a simple example to run quickly, refer to the one in the [Using This Application](#using-this-application) section. A full example is also shown below with descriptions. Optionally, look at `examples/` folder of the github repo for more examples. +_Ensure [Authentication](#authentication-and-permissions) has been set up beforehand for required credentials._ For a simple example to run quickly, refer to the one in the [Using This Application](#using-this-application) section. + +A full example is also shown below. Optionally, look at the `examples/` folder of the GitHub repo for more examples with long descriptions. 
For object storage configuration, find more information in their respective sections - [Minio](#minio-backups) @@ -214,17 +220,21 @@ Below is an example configuration that shows example values for all possible opt ```yaml host: "https://bookstack.yourdomain.com" credentials: - token_id: "" - token_secret: "" -additional_headers: - test: "test" - test2: "test2" - User-Agent: "test-agent" + token_id: "" + token_secret: "" formats: - markdown - html - pdf - plaintext +http_config: + verify_ssl: false + timeout: 30 + backoff_factor: 2.5 + retry_codes: [413, 429, 500, 502, 503, 504] + retry_count: 5 + additional_headers: + User-Agent: "test-agent" minio: host: "minio.yourdomain.com" access_key: "" @@ -239,7 +249,6 @@ assets: export_attachments: true modify_markdown: false export_meta: false - verify_ssl: true keep_last: 5 run_interval: 0 ``` @@ -249,20 +258,25 @@ More descriptions can be found for each section below: | Configuration Item | Type | Required | Description | | ------------------ | ---- | -------- | ----------- | -| `host` | `str` | `true` | If `http/https` not specified in the url, defaults to `https`. Use `assets.verify_ssl` to disable certificate checking. | +| `host` | `str` | `true` | If `http/https` not specified in the url, defaults to `https`. Use `http_config.verify_ssl` to disable certificate checking. | | `credentials` | `object` | `false` | Optional section where Bookstack tokenId and tokenSecret can be specified. Env variable for credentials may be supplied instead. See [Authentication](#authentication) for more details. | -| `credentials.token_id` | `str`| `true` if `credentials` | If `credentials` section is given, this should be a valid tokenId | -| `credentials.token_secret` | `str` | `true` if `credentials`| If `credentials` section is given, this should be a valid tokenSecret | -| `additional_headers` | `object` | `false` | Optional section where key/value for pairs can be specified to use in Bookstack http request headers. 
+| `credentials.token_id` | `str`| `false` if specified through env var instead, otherwise `true` | A valid Bookstack tokenId. | +| `credentials.token_secret` | `str` | `false` if specified through env var instead, otherwise `true` | A valid Bookstack tokenSecret. | | `formats` | `list` | `true` | Which export formats to use for Bookstack page content. Valid options are: `["markdown", "html", "pdf", "plaintext"]`| | `output_path` | `str` | `false` | Optional (default: `cwd`) which directory (relative or full path) to place exports. User who runs the command should have access to read/write to this directory. This directory and any parent directories will be attempted to be created if they do not exist. If not provided, will use current run directory by default. If using docker, this option can be omitted. | | `assets` | `object` | `false` | Optional section to export additional assets from pages. | | `assets.export_images` | `bool` | `false` | Optional (default: `false`), export all images for a page to an `image` directory within page directory. See [Backup Behavior](#backup-behavior) for more information on layout | | `assets.export_attachments` | `bool` | `false` | Optional (default: `false`), export all attachments for a page to an `attachments` directory within page directory. See [Backup Behavior](#backup-behavior) for more information on layout | -| `assets.modify_markdown` | `bool` | `false` | Optional (default: `false`), modify markdown files to replace image links with local exported image paths. This requires `assets.export_images` to be `true` in order to work. See [Modify Markdown Files](#modify-markdown-files) for more information. 
-| `assets.export_meta` | `bool` | `false` | Optional (default: `false`), export of metadata about the page in a json file | -| `assets.verify_ssl` | `bool` | `false` | Optional (default: `true`), whether or not to check ssl certificates when requesting content from Bookstack host | -| `keep_last` | `int` | `false` | Optional (default: `None`), if exporter can delete older archives. valid values are:
- set to `-1` if you want to delete all archives after each run (useful if you only want to upload to object storage)
- set to `1+` if you want to retain a certain number of archives
- `0` will result in no action done | +| `assets.modify_markdown` | `bool` | `false` | Optional (default: `false`), modify markdown files to replace image links with local exported image paths. This requires `assets.export_images` to be `true` in order to work. See [Modify Markdown Files](#modify-markdown-files) for more information. | +| `assets.export_meta` | `bool` | `false` | Optional (default: `false`), export of metadata about the page in a json file. | +| `http_config` | `object` | `false` | Optional section to override default http configuration. | +| `http_config.verify_ssl` | `bool` | `false` | Optional (default: `false`), whether or not to verify ssl certificates if using https. | +| `http_config.timeout` | `int` | `false` | Optional (default: `30`), set the timeout, in seconds, for http requests. | +| `http_config.retry_count` | `int` | `false` | Optional (default: `5`), the number of http retries after initial failure. | +| `http_config.retry_codes` | `List[int]` | `false` | Optional (default: `[413, 429, 500, 502, 503, 504]`), which http response status codes trigger a retry. | +| `http_config.backoff_factor` | `float` | `false` | Optional (default: `2.5`), set the backoff_factor for http request retries. Default backoff_factor `2.5` means we wait 5, 10, 20, and then 40 seconds (with default `http_config.retry_count: 5`) before our last retry. This should allow for per minute rate limits to be refreshed. | +| `http_config.additional_headers` | `object` | `false` | Optional (default: `{}`), specify key/value pairs that will be added as additional headers to http requests. | +| `keep_last` | `int` | `false` | Optional (default: `0`), if exporter can delete older archives. valid values are:
- set to `-1` if you want to delete all archives after each run (useful if you only want to upload to object storage)
- set to `1+` if you want to retain a certain number of archives
- `0` will result in no action done. | | `run_interval` | `int` | `false` | Optional (default: `0`). If specified, exporter will run as an application and pause for `{run_interval}` seconds before subsequent runs. Example: `86400` seconds = `24` hours or run once a day. Setting this property to `0` will invoke a single run and exit. Used for basic scheduling of backups. | | `minio` | `object` | `false` | Optional [Minio](#minio-backups) configuration options. | @@ -464,7 +478,14 @@ minio: | `access_key` | `str` | `false` if specified through env var instead, otherwise `true` | Access key for the minio instance | | `secret_key` | `str` | `false` if specified through env var, otherwise `true` | Secret key for the minio instance | | `path` | `str` | `false` | Optional, path of the backup to use. Will use root bucket path if not set. `://bookstack-.tgz` | -| `keep_last` | `int` | `false` | Optional (default: `None`), if exporter can delete older archives in minio.
- set to `1+` if you want to retain a certain number of archives
- `0` will result in no action done | +| `keep_last` | `int` | `false` | Optional (default: `0`), if exporter can delete older archives in minio.
- set to `1+` if you want to retain a certain number of archives
- `0` will result in no action done | + +## Potential Breaking Upgrades +Below are version upgrades that include major changes to how the exporter is configured or run. + +| Start Version | Target Version | Description | +| ------------- | -------------- | ----------- | +| `<= 1.4.X` | `1.5.0` | `assets.verify_ssl` has been moved to `http_config.verify_ssl` and the default value has been updated to `false`. `additional_headers` has been moved to `http_config.additional_headers`. | ## Future Items 1. ~~Be able to pull images locally and place in their respective page folders for a more complete file level backup.~~ diff --git a/bookstack_file_exporter/archiver/archiver.py b/bookstack_file_exporter/archiver/archiver.py index 969ee14..098b1a0 100644 --- a/bookstack_file_exporter/archiver/archiver.py +++ b/bookstack_file_exporter/archiver/archiver.py @@ -9,6 +9,7 @@ from bookstack_file_exporter.archiver.minio_archiver import MinioArchiver from bookstack_file_exporter.config_helper.remote import StorageProviderConfig from bookstack_file_exporter.config_helper.config_helper import ConfigNode +from bookstack_file_exporter.common.util import HttpHelper log = logging.getLogger(__name__) @@ -22,17 +23,18 @@ class Archiver: Args: :config: = Configuration with user inputs and general options. + :http_client: = http helper functions with config from user inputs Returns: Archiver instance with attributes that are accessible for use for handling bookstack exports and remote uploads. 
""" - def __init__(self, config: ConfigNode): + def __init__(self, config: ConfigNode, http_client: HttpHelper): self.config = config # for convenience self.base_dir = config.base_dir_name self.archive_dir = self._generate_root_folder(self.base_dir) - self._page_archiver = PageArchiver(self.archive_dir, self.config) + self._page_archiver = PageArchiver(self.archive_dir, self.config, http_client) self._remote_exports = {'minio': self._archive_minio, 's3': self._archive_s3} def create_export_dir(self): diff --git a/bookstack_file_exporter/archiver/asset_archiver.py b/bookstack_file_exporter/archiver/asset_archiver.py index dbea2bf..c119e7f 100644 --- a/bookstack_file_exporter/archiver/asset_archiver.py +++ b/bookstack_file_exporter/archiver/asset_archiver.py @@ -5,7 +5,7 @@ # pylint: disable=import-error from requests import Response -from bookstack_file_exporter.common import util as common_util +from bookstack_file_exporter.common.util import HttpHelper log = logging.getLogger(__name__) @@ -112,28 +112,23 @@ class AssetArchiver: Args: :urls: = api urls for images and attachments - :headers: = http headers for api requests - :verify_ssl: = verify ssl for api requests + :http_client: = http helper functions with config from user inputs Returns: AssetArchiver instance for use in archiving images and attachments for a page """ - def __init__(self, urls: Dict[str, str], headers: Dict[str, str], - verify_ssl: bool): + def __init__(self, urls: Dict[str, str], http_client: HttpHelper): self.api_urls = urls - self.verify_ssl = verify_ssl - self._headers = headers self._asset_map = { 'images': self._create_image_map, 'attachments': self._create_attachment_map } + self.http_client = http_client def get_asset_nodes(self, asset_type: str) -> Dict[str, ImageNode | AttachmentNode]: """Get image or attachment helpers for a page""" - asset_response: Response = common_util.http_get_request( - self.api_urls[asset_type], - self._headers, - self.verify_ssl) + asset_response: Response = 
self.http_client.http_get_request( + self.api_urls[asset_type]) asset_json = asset_response.json()['data'] return self._asset_map[asset_type](asset_json) @@ -141,18 +136,14 @@ def get_asset_data(self, asset_type: str, meta_data: Union[AttachmentNode, ImageNode]) -> Dict[str, str | bool | int | dict]: """Get asset data based on type""" data_url = f"{self.api_urls[asset_type]}/{meta_data.id_}" - asset_data_response: Response = common_util.http_get_request( - data_url, - self._headers, - self.verify_ssl) + asset_data_response: Response = self.http_client.http_get_request( + data_url) return asset_data_response.json() def get_asset_bytes(self, asset_type: str, url: str) -> bytes: """Get raw asset data""" - asset_response: Response = common_util.http_get_request( - url, - self._headers, - self.verify_ssl) + asset_response: Response = self.http_client.http_get_request( + url) match asset_type: case "images": asset_data = asset_response.content diff --git a/bookstack_file_exporter/archiver/page_archiver.py b/bookstack_file_exporter/archiver/page_archiver.py index fba5047..0b6b745 100644 --- a/bookstack_file_exporter/archiver/page_archiver.py +++ b/bookstack_file_exporter/archiver/page_archiver.py @@ -6,6 +6,7 @@ from bookstack_file_exporter.archiver import util as archiver_util from bookstack_file_exporter.archiver.asset_archiver import AssetArchiver, ImageNode, AttachmentNode from bookstack_file_exporter.config_helper.config_helper import ConfigNode +from bookstack_file_exporter.common.util import HttpHelper log = logging.getLogger(__name__) @@ -33,17 +34,16 @@ class PageArchiver: Args: :archive_dir: = directory where data will be put into. - :config: = Configuration with user inputs and general options. + :http_client: = http helper functions with config from user inputs Returns: :PageArchiver: instance with methods to help collect page content from a Bookstack instance. 
""" - def __init__(self, archive_dir: str, config: ConfigNode) -> None: + def __init__(self, archive_dir: str, config: ConfigNode, http_client: HttpHelper) -> None: self.asset_config = config.user_inputs.assets self.export_formats = config.user_inputs.formats self.api_urls = config.urls - self._headers = config.headers # full path, bookstack-, and .tgz extension self.archive_file = f"{archive_dir}{_FILE_EXTENSION_MAP['tgz']}" # name of intermediate tar file before gzip @@ -51,8 +51,9 @@ def __init__(self, archive_dir: str, config: ConfigNode) -> None: # name of the base folder to use within the tgz archive (internal tar layout) self.archive_base_path = archive_dir.split("/")[-1] self.modify_md: bool = self._check_md_modify() - self.asset_archiver = AssetArchiver(self.api_urls, self._headers, - self.verify_ssl) + self.asset_archiver = AssetArchiver(self.api_urls, + http_client) + self.http_client = http_client def _check_md_modify(self) -> bool: # check to ensure they have asset_config defined, could be None @@ -107,8 +108,8 @@ def _archive_page(self, page: Node, export_format: str, data: bytes): def _get_page_data(self, page_id: int, export_format: str) -> bytes: url = f"{self.api_urls['pages']}/{page_id}/{_EXPORT_API_PATH}/{export_format}" - return archiver_util.get_byte_response(url=url, headers=self._headers, - verify_ssl=self.verify_ssl) + return archiver_util.get_byte_response(url=url, + http_client=self.http_client) def _archive_page_meta(self, page_path: str, meta_data: Dict[str, Union[str, int]]): meta_file_name = f"{self.archive_base_path}/{page_path}{_FILE_EXTENSION_MAP['meta']}" diff --git a/bookstack_file_exporter/archiver/util.py b/bookstack_file_exporter/archiver/util.py index e6bcb8f..21cf8af 100644 --- a/bookstack_file_exporter/archiver/util.py +++ b/bookstack_file_exporter/archiver/util.py @@ -9,13 +9,13 @@ import glob from pathlib import Path -from bookstack_file_exporter.common import util +from bookstack_file_exporter.common.util import 
HttpHelper log = logging.getLogger(__name__) -def get_byte_response(url: str, headers: Dict[str, str], verify_ssl: bool) -> bytes: +def get_byte_response(url: str, http_client: HttpHelper) -> bytes: """get byte response from http request""" - response = util.http_get_request(url=url, headers=headers, verify_ssl=verify_ssl) + response = http_client.http_get_request(url=url) return response.content # append to a tar file instead of creating files locally and then tar'ing after diff --git a/bookstack_file_exporter/common/util.py b/bookstack_file_exporter/common/util.py index f2ff5f4..02cd807 100644 --- a/bookstack_file_exporter/common/util.py +++ b/bookstack_file_exporter/common/util.py @@ -1,44 +1,72 @@ import logging from typing import Dict +import urllib3 # pylint: disable=import-error import requests # pylint: disable=import-error from requests.adapters import HTTPAdapter, Retry +from bookstack_file_exporter.config_helper.models import HttpConfig + log = logging.getLogger(__name__) -def http_get_request(url: str, headers: Dict[str, str], - verify_ssl: bool, timeout: int = 30) -> requests.Response: - """make http requests and return response object""" - url_prefix = should_verify(url) - try: - with requests.Session() as session: - # {backoff factor} * (2 ** ({number of previous retries})) - # {raise_on_status} if status falls in status_forcelist range - # and retries have been exhausted. 
- # {status_force_list} 413, 429, 503 defaults are overwritten with additional ones - retries = Retry(total=5, - backoff_factor=0.5, - raise_on_status=True, - status_forcelist=[413, 429, 500, 502, 503, 504]) - session.mount(url_prefix, HTTPAdapter(max_retries=retries)) - response = session.get(url, headers=headers, verify=verify_ssl, timeout=timeout) - except Exception as req_err: - log.error("Failed to make request for %s", url) - raise req_err - try: - #raise_for_status() throws an exception on codes 400-599 - response.raise_for_status() - except requests.exceptions.HTTPError as e: - # this means it either exceeded 50X retries in `http_get_request` handler - # or it returned a 40X which is not expected - log.error("Bookstack request failed with status code: %d on url: %s", - response.status_code, url) - raise e - return response - -def should_verify(url: str) -> str: - """check if http or https""" - if url.startswith("https"): - return "https://" - return "http://" +# disable TLS warnings if using verify_ssl=false +urllib3.disable_warnings() + +class HttpHelper: + """ + HttpHelper provides an http request helper with config stored and retries built in + + Args: + :headers: = all headers to use for http requests + :config: = Configuration with user inputs for http requests + + Returns: + :HttpHelper: instance with methods to help with http requests. 
+ """ + def __init__(self, headers: Dict[str, str], + config: HttpConfig): + self.backoff_factor = config.backoff_factor + self.retry_codes = config.retry_codes + self.retry_count = config.retry_count + self.http_timeout = config.timeout + self.verify_ssl = config.verify_ssl + self._headers = headers + + # more details on options: https://urllib3.readthedocs.io/en/stable/reference/urllib3.util.html + def http_get_request(self, url: str) -> requests.Response: + """make http requests and return response object""" + url_prefix = self.should_verify(url) + try: + with requests.Session() as session: + # {backoff factor} * (2 ** ({number of previous retries})) + # {raise_on_status} if status falls in status_forcelist range + # and retries have been exhausted. + # {status_force_list} 413, 429, 503 defaults are overwritten with additional ones + retries = Retry(total=self.retry_count, + backoff_factor=self.backoff_factor, + raise_on_status=True, + status_forcelist=self.retry_codes) + session.mount(url_prefix, HTTPAdapter(max_retries=retries)) + response = session.get(url, headers=self._headers, verify=self.verify_ssl, + timeout=self.http_timeout) + except Exception as req_err: + log.error("Failed to make request for %s", url) + raise req_err + try: + #raise_for_status() throws an exception on codes 400-599 + response.raise_for_status() + except requests.exceptions.HTTPError as e: + # this means it either exceeded 50X retries in `http_get_request` handler + # or it returned a 40X which is not expected + log.error("Bookstack request failed with status code: %d on url: %s", + response.status_code, url) + raise e + return response + + @staticmethod + def should_verify(url: str) -> str: + """check if http or https""" + if url.startswith("https"): + return "https://" + return "http://" diff --git a/bookstack_file_exporter/config_helper/config_helper.py b/bookstack_file_exporter/config_helper/config_helper.py index e89a861..a1c3fe0 100644 --- 
a/bookstack_file_exporter/config_helper/config_helper.py +++ b/bookstack_file_exporter/config_helper/config_helper.py @@ -79,11 +79,8 @@ def _generate_config(self, config_file: str) -> models.UserInput: def _generate_credentials(self) -> Tuple[str, str]: # if user provided credentials in config file, load them - token_id = "" - token_secret = "" - if self.user_inputs.credentials: - token_id = self.user_inputs.credentials.token_id - token_secret = self.user_inputs.credentials.token_secret + token_id = self.user_inputs.credentials.token_id + token_secret = self.user_inputs.credentials.token_secret # check to see if env var is specified, if so, it takes precedence token_id = self._check_var(_BOOKSTACK_TOKEN_FIELD, token_id) @@ -98,15 +95,20 @@ def _generate_remote_config(self) -> Dict[str, StorageProviderConfig]: self.user_inputs.minio.access_key) minio_secret_key = self._check_var(_MINIO_SECRET_KEY_FIELD, self.user_inputs.minio.secret_key) + object_config["minio"] = StorageProviderConfig(minio_access_key, minio_secret_key, self.user_inputs.minio) + for platform, config in object_config.items(): + if not config.is_valid(platform): + error_str = "provided " + platform + " configuration is invalid" + raise ValueError(error_str) return object_config def _generate_headers(self) -> Dict[str, str]: headers = {} # add additional_headers provided by user - if self.user_inputs.additional_headers: - for key, value in self.user_inputs.additional_headers.items(): + if self.user_inputs.http_config.additional_headers: + for key, value in self.user_inputs.http_config.additional_headers.items(): headers[key] = value # add default headers diff --git a/bookstack_file_exporter/config_helper/models.py b/bookstack_file_exporter/config_helper/models.py index 153296e..2e89649 100644 --- a/bookstack_file_exporter/config_helper/models.py +++ b/bookstack_file_exporter/config_helper/models.py @@ -5,19 +5,19 @@ # pylint: disable=too-few-public-methods class ObjectStorageConfig(BaseModel): 
"""YAML schema for minio configuration""" - host: str - access_key: Optional[str] = None - secret_key: Optional[str] = None + host: Optional[str] = "" + access_key: Optional[str] = "" + secret_key: Optional[str] = "" bucket: str - path: Optional[str] = None + path: Optional[str] = "" region: str - keep_last: Optional[int] = None + keep_last: Optional[int] = 0 # pylint: disable=too-few-public-methods class BookstackAccess(BaseModel): """YAML schema for bookstack access credentials""" - token_id: str - token_secret: str + token_id: Optional[str] = "" + token_secret: Optional[str] = "" # pylint: disable=too-few-public-methods class Assets(BaseModel): @@ -26,17 +26,25 @@ class Assets(BaseModel): export_attachments: Optional[bool] = False modify_markdown: Optional[bool] = False export_meta: Optional[bool] = False - verify_ssl: Optional[bool] = True + +class HttpConfig(BaseModel): + """YAML schema for user provided http settings""" + verify_ssl: Optional[bool] = False + timeout: Optional[int] = 30 + backoff_factor: Optional[float] = 2.5 + retry_codes: Optional[List[int]] = [413, 429, 500, 502, 503, 504] + retry_count: Optional[int] = 5 + additional_headers: Optional[Dict[str, str]] = {} # pylint: disable=too-few-public-methods class UserInput(BaseModel): """YAML schema for user provided configuration file""" host: str - additional_headers: Optional[Dict[str, str]] = None - credentials: Optional[BookstackAccess] = None + credentials: Optional[BookstackAccess] = BookstackAccess() formats: List[Literal["markdown", "html", "pdf", "plaintext"]] - output_path: Optional[str] = None + output_path: Optional[str] = "" assets: Optional[Assets] = Assets() minio: Optional[ObjectStorageConfig] = None - keep_last: Optional[int] = None - run_interval: Optional[int] = 0 \ No newline at end of file + keep_last: Optional[int] = 0 + run_interval: Optional[int] = 0 + http_config: Optional[HttpConfig] = HttpConfig() diff --git a/bookstack_file_exporter/config_helper/remote.py 
b/bookstack_file_exporter/config_helper/remote.py index d97dd04..ddcc717 100644 --- a/bookstack_file_exporter/config_helper/remote.py +++ b/bookstack_file_exporter/config_helper/remote.py @@ -1,5 +1,9 @@ +import logging + from bookstack_file_exporter.config_helper.models import ObjectStorageConfig +log = logging.getLogger(__name__) + ## convenience class ## able to work for minio, s3, etc. class StorageProviderConfig: @@ -21,6 +25,7 @@ def __init__(self, access_key: str, secret_key: str, config: ObjectStorageConfig self.config = config self._access_key = access_key self._secret_key = secret_key + self._valid_checker = {'minio': self._is_minio_valid()} @property def access_key(self) -> str: @@ -31,3 +36,19 @@ def access_key(self) -> str: def secret_key(self) -> str: """return secret key for use""" return self._secret_key + + def is_valid(self, storage_type: str) -> bool: + """check if object storage config is valid""" + return self._valid_checker[storage_type] + + def _is_minio_valid(self) -> bool: + """check if minio config is valid""" + # required values - keys and bucket already checked so skip + checks = { + "host": self.config.host + } + for prop, check in checks.items(): + if not check: + log.error("%s is missing from minio configuration and is required", prop) + return False + return True diff --git a/bookstack_file_exporter/exporter/exporter.py b/bookstack_file_exporter/exporter/exporter.py index 52702f2..fd1b3ed 100644 --- a/bookstack_file_exporter/exporter/exporter.py +++ b/bookstack_file_exporter/exporter/exporter.py @@ -5,7 +5,7 @@ from requests import Response from bookstack_file_exporter.exporter.node import Node -from bookstack_file_exporter.common import util +from bookstack_file_exporter.common.util import HttpHelper log = logging.getLogger(__name__) @@ -19,10 +19,9 @@ class NodeExporter(): Returns: NodeExporter instance to handle building shelve/book/chapter/page relations. 
""" - def __init__(self, api_urls: Dict[str, str], headers: Dict[str,str], verify_ssl: bool): + def __init__(self, api_urls: Dict[str, str], http_client: HttpHelper): self.api_urls = api_urls - self.headers = headers - self.verify_ssl = verify_ssl + self.http_client = http_client def get_all_shelves(self) -> Dict[int, Node]: """ @@ -38,8 +37,7 @@ def get_all_shelves(self) -> Dict[int, Node]: def _get_json_response(self, url: str) -> List[Dict[str, Union[str,int]]]: """get http response data in json format""" - response: Response = util.http_get_request(url=url, headers=self.headers, - verify_ssl=self.verify_ssl) + response: Response = self.http_client.http_get_request(url=url) return response.json() def _get_all_ids(self, url: str) -> List[int]: diff --git a/bookstack_file_exporter/run.py b/bookstack_file_exporter/run.py index 547760f..e7b408e 100644 --- a/bookstack_file_exporter/run.py +++ b/bookstack_file_exporter/run.py @@ -8,6 +8,7 @@ from bookstack_file_exporter.exporter.node import Node from bookstack_file_exporter.exporter.exporter import NodeExporter from bookstack_file_exporter.archiver.archiver import Archiver +from bookstack_file_exporter.common.util import HttpHelper log = logging.getLogger(__name__) @@ -18,7 +19,7 @@ def entrypoint(args: argparse.Namespace): if config.user_inputs.run_interval: while True: exporter(config) - log.info(f"Waiting {config.user_inputs.run_interval} seconds for next run") + log.info("Waiting %s seconds for next run", config.user_inputs.run_interval) # sleep process state time.sleep(config.user_inputs.run_interval) exporter(config) @@ -26,23 +27,21 @@ def entrypoint(args: argparse.Namespace): def exporter(config: ConfigNode): """export bookstack nodes and archive locally and/or remotely""" - ## convenience vars - bookstack_headers = config.headers - api_urls = config.urls - unassigned_dir = config.unassigned_book_dir - verify_ssl = config.user_inputs.assets.verify_ssl - #### Export Data ##### # need to implement pagination for 
apis log.info("Beginning run") + ## Helper functions with user provided (or defaults) http config + http_client = HttpHelper(config.headers, config.user_inputs.http_config) + ## Use exporter class to get all the resources (pages, books, etc.) and their relationships log.info("Building shelve/book/chapter/page relationships") - export_helper = NodeExporter(api_urls, bookstack_headers, verify_ssl) + export_helper = NodeExporter(config.urls, http_client) ## shelves shelve_nodes: Dict[int, Node] = export_helper.get_all_shelves() ## books - book_nodes: Dict[int, Node] = export_helper.get_all_books(shelve_nodes, unassigned_dir) + book_nodes: Dict[int, Node] = export_helper.get_all_books(shelve_nodes, + config.unassigned_book_dir) ## pages page_nodes: Dict[int, Node] = export_helper.get_all_pages(book_nodes) if not page_nodes: @@ -50,7 +49,7 @@ def exporter(config: ConfigNode): sys.exit(0) log.info("Beginning archive") ## start archive ## - archive: Archiver = Archiver(config) + archive: Archiver = Archiver(config, http_client) # create export directory if not exists archive.create_export_dir() diff --git a/examples/config.yml b/examples/config.yml index 07df0bb..99481d4 100644 --- a/examples/config.yml +++ b/examples/config.yml @@ -3,20 +3,14 @@ # if you put http here, it will try verify=false, not to check certs host: "https://bookstack.mydomain.org" # You could optionally set the bookstack token_id and token_secret here instead of env -# If using env vars instead you can omit/comment out this section +# If using env vars instead you can leave values empty or omit this section credentials: - # set here or as env variable, BOOKSTACK_TOKEN_ID - # env var takes precedence over below - token_id: "" - # set here or as env variable, BOOKSTACK_TOKEN_SECRET - # env var takes precedence over below - token_secret: "" -# optional - additional headers to add, examples below -# if not required, you can omit/comment out section -additional_headers: - test: "test" - test2: "test2" - 
User-Agent: "test-agent" + # set here or as env variable, BOOKSTACK_TOKEN_ID + # env var takes precedence over below + token_id: "" + # set here or as env variable, BOOKSTACK_TOKEN_SECRET + # env var takes precedence over below + token_secret: "" # supported formats from bookstack below # specify one or more formats: @@ -40,8 +34,26 @@ assets: # like: last update, owner, revision count, etc. # omit this or set to false if not needed export_meta: false - # optional whether or not to check ssl certificates when requesting content from Bookstack host - verify_ssl: true +# optional - can override default http_config +# if not required, you can omit/comment out section +# https://urllib3.readthedocs.io/en/stable/reference/urllib3.util.html +http_config: + # whether or not to verify ssl certificates if using https + verify_ssl: false + # set http timeout in seconds for requests + timeout: 30 + # default backoff_factor 2.5 means we wait 5, 10, 20, and then 40 seconds before our last retry + # - this should allow for per minute rate limits to be refreshed + backoff_factor: 2.5 + # which status codes trigger retries + retry_codes: [413, 429, 500, 502, 503, 504] + # number of retries + retry_count: 5 + # any additional headers to add to http requests + additional_headers: + test: "test" + test2: "test2" + User-Agent: "test-agent" # directory to export to # relative or full path output_path: "bkps/" diff --git a/examples/minio_config.yml b/examples/minio_config.yml index 56efe9d..517fa3d 100644 --- a/examples/minio_config.yml +++ b/examples/minio_config.yml @@ -3,20 +3,14 @@ # if you put http here, it will try verify=false, not to check certs host: "https://bookstack.mydomain.org" # You could optionally set the bookstack token_id and token_secret here instead of env -# If using env vars instead you can omit/comment
out this section +# If using env vars instead you can leave values empty or omit this section credentials: - # set here or as env variable, BOOKSTACK_TOKEN_ID - # env var takes precedence over below - token_id: "" - # set here or as env variable, BOOKSTACK_TOKEN_SECRET - # env var takes precedence over below - token_secret: "" -# additional headers to add, examples below -# if not required, you can omit/comment out -additional_headers: - test: "test" - test2: "test2" - User-Agent: "test-agent" + # set here or as env variable, BOOKSTACK_TOKEN_ID + # env var takes precedence over below + token_id: "" + # set here or as env variable, BOOKSTACK_TOKEN_SECRET + # env var takes precedence over below + token_secret: "" # supported formats from bookstack below # specify one or more formats: @@ -69,13 +63,26 @@ assets: # like: last update, owner, revision count, etc. # omit this or set to false if not needed export_meta: false - # optional whether or not to check ssl certificates when requesting content from Bookstack host - verify_ssl: true -# After uploading to object storage targets, choose to clean up local files -# delete the archive from local filesystem -# optional -# default = false if omitted -clean_up: true +# optional - can override default http_config +# if not required, you can omit/comment out section +# https://urllib3.readthedocs.io/en/stable/reference/urllib3.util.html +http_config: + # whether or not to verify ssl certificates if using https + verify_ssl: false + # set http timeout in seconds for requests + timeout: 30 + # default backoff_factor 2.5 means we wait 5, 10, 20, and then 40 seconds before our last retry + # - this should allow for per minute rate limits to be refreshed + backoff_factor: 2.5 + # which status codes trigger retries + retry_codes: [413, 429, 500, 502, 503, 504] + # number of retries + retry_count: 5 + # any additional headers to add to http requests + additional_headers: + test: "test" + test2: "test2" + User-Agent: "test-agent" # 
optional if specified exporter can delete older archives # valid values are: # set to -1 if you want to delete all archives after each run