Http input feature #62

Merged · 6 commits · Mar 7, 2025

6 changes: 4 additions & 2 deletions bookstack_file_exporter/archiver/archiver.py
@@ -9,6 +9,7 @@
from bookstack_file_exporter.archiver.minio_archiver import MinioArchiver
from bookstack_file_exporter.config_helper.remote import StorageProviderConfig
from bookstack_file_exporter.config_helper.config_helper import ConfigNode
from bookstack_file_exporter.common.util import HttpHelper

log = logging.getLogger(__name__)

@@ -22,17 +23,18 @@ class Archiver:

Args:
:config: <ConfigNode> = Configuration with user inputs and general options.
:http_client: <HttpHelper> = http helper functions with config from user inputs

Returns:
Archiver instance with attributes that are accessible
for use for handling bookstack exports and remote uploads.
"""
def __init__(self, config: ConfigNode):
def __init__(self, config: ConfigNode, http_client: HttpHelper):
self.config = config
# for convenience
self.base_dir = config.base_dir_name
self.archive_dir = self._generate_root_folder(self.base_dir)
self._page_archiver = PageArchiver(self.archive_dir, self.config)
self._page_archiver = PageArchiver(self.archive_dir, self.config, http_client)
self._remote_exports = {'minio': self._archive_minio, 's3': self._archive_s3}

def create_export_dir(self):
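For context, a minimal sketch of how the reworked constructor could be wired up; `config` is assumed to be an already-built ConfigNode, and the exact call site is not part of this diff:

from bookstack_file_exporter.archiver.archiver import Archiver
from bookstack_file_exporter.common.util import HttpHelper

# A single HttpHelper built from the ConfigNode's headers and http settings is
# shared with the Archiver, which in turn hands it down to PageArchiver.
http_client = HttpHelper(config.headers, config.user_inputs.http_config)
archiver = Archiver(config, http_client)
archiver.create_export_dir()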
29 changes: 10 additions & 19 deletions bookstack_file_exporter/archiver/asset_archiver.py
@@ -5,7 +5,7 @@
# pylint: disable=import-error
from requests import Response

from bookstack_file_exporter.common import util as common_util
from bookstack_file_exporter.common.util import HttpHelper

log = logging.getLogger(__name__)

@@ -112,47 +112,38 @@ class AssetArchiver:

Args:
:urls: <Dict[str, str]> = api urls for images and attachments
:headers: <Dict[str, str]> = http headers for api requests
:verify_ssl: <bool> = verify ssl for api requests
:http_client: <HttpHelper> = http helper functions with config from user inputs

Returns:
AssetArchiver instance for use in archiving images and attachments for a page
"""
def __init__(self, urls: Dict[str, str], headers: Dict[str, str],
verify_ssl: bool):
def __init__(self, urls: Dict[str, str], http_client: HttpHelper):
self.api_urls = urls
self.verify_ssl = verify_ssl
self._headers = headers
self._asset_map = {
'images': self._create_image_map,
'attachments': self._create_attachment_map
}
self.http_client = http_client

def get_asset_nodes(self, asset_type: str) -> Dict[str, ImageNode | AttachmentNode]:
"""Get image or attachment helpers for a page"""
asset_response: Response = common_util.http_get_request(
self.api_urls[asset_type],
self._headers,
self.verify_ssl)
asset_response: Response = self.http_client.http_get_request(
self.api_urls[asset_type])
asset_json = asset_response.json()['data']
return self._asset_map[asset_type](asset_json)

def get_asset_data(self, asset_type: str,
meta_data: Union[AttachmentNode, ImageNode]) -> Dict[str, str | bool | int | dict]:
"""Get asset data based on type"""
data_url = f"{self.api_urls[asset_type]}/{meta_data.id_}"
asset_data_response: Response = common_util.http_get_request(
data_url,
self._headers,
self.verify_ssl)
asset_data_response: Response = self.http_client.http_get_request(
data_url)
return asset_data_response.json()

def get_asset_bytes(self, asset_type: str, url: str) -> bytes:
"""Get raw asset data"""
asset_response: Response = common_util.http_get_request(
url,
self._headers,
self.verify_ssl)
asset_response: Response = self.http_client.http_get_request(
url)
match asset_type:
case "images":
asset_data = asset_response.content
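A rough illustration of the updated call flow; `http_client` stands in for an already-constructed HttpHelper, and the URL values are only examples of what ConfigNode.urls would supply:

from bookstack_file_exporter.archiver.asset_archiver import AssetArchiver

# Headers and verify_ssl no longer ride along on every call; the helper owns them.
api_urls = {
    "images": "https://bookstack.example.com/api/image-gallery",
    "attachments": "https://bookstack.example.com/api/attachments",
}
asset_archiver = AssetArchiver(api_urls, http_client)
image_nodes = asset_archiver.get_asset_nodes("images")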
15 changes: 8 additions & 7 deletions bookstack_file_exporter/archiver/page_archiver.py
@@ -6,6 +6,7 @@
from bookstack_file_exporter.archiver import util as archiver_util
from bookstack_file_exporter.archiver.asset_archiver import AssetArchiver, ImageNode, AttachmentNode
from bookstack_file_exporter.config_helper.config_helper import ConfigNode
from bookstack_file_exporter.common.util import HttpHelper

log = logging.getLogger(__name__)

@@ -33,26 +34,26 @@ class PageArchiver:

Args:
:archive_dir: <str> = directory where data will be put into.

:config: <ConfigNode> = Configuration with user inputs and general options.
:http_client: <HttpHelper> = http helper functions with config from user inputs

Returns:
:PageArchiver: instance with methods to help collect page content from a Bookstack instance.
"""
def __init__(self, archive_dir: str, config: ConfigNode) -> None:
def __init__(self, archive_dir: str, config: ConfigNode, http_client: HttpHelper) -> None:
self.asset_config = config.user_inputs.assets
self.export_formats = config.user_inputs.formats
self.api_urls = config.urls
self._headers = config.headers
# full path, bookstack-<timestamp>, and .tgz extension
self.archive_file = f"{archive_dir}{_FILE_EXTENSION_MAP['tgz']}"
# name of intermediate tar file before gzip
self.tar_file = f"{archive_dir}{_FILE_EXTENSION_MAP['tar']}"
# name of the base folder to use within the tgz archive (internal tar layout)
self.archive_base_path = archive_dir.split("/")[-1]
self.modify_md: bool = self._check_md_modify()
self.asset_archiver = AssetArchiver(self.api_urls, self._headers,
self.verify_ssl)
self.asset_archiver = AssetArchiver(self.api_urls,
http_client)
self.http_client = http_client

def _check_md_modify(self) -> bool:
# check to ensure they have asset_config defined, could be None
@@ -107,8 +108,8 @@ def _archive_page(self, page: Node, export_format: str, data: bytes):

def _get_page_data(self, page_id: int, export_format: str) -> bytes:
url = f"{self.api_urls['pages']}/{page_id}/{_EXPORT_API_PATH}/{export_format}"
return archiver_util.get_byte_response(url=url, headers=self._headers,
verify_ssl=self.verify_ssl)
return archiver_util.get_byte_response(url=url,
http_client=self.http_client)

def _archive_page_meta(self, page_path: str, meta_data: Dict[str, Union[str, int]]):
meta_file_name = f"{self.archive_base_path}/{page_path}{_FILE_EXTENSION_MAP['meta']}"
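Correspondingly, a brief sketch of constructing PageArchiver with the shared helper; `archive_dir`, `config`, and `http_client` are assumed from the surrounding project:

from bookstack_file_exporter.archiver.page_archiver import PageArchiver

# The same HttpHelper is reused; PageArchiver forwards it to its AssetArchiver
# and to archiver_util.get_byte_response when pulling page exports.
page_archiver = PageArchiver(archive_dir, config, http_client)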
6 changes: 3 additions & 3 deletions bookstack_file_exporter/archiver/util.py
@@ -9,13 +9,13 @@
import glob
from pathlib import Path

from bookstack_file_exporter.common import util
from bookstack_file_exporter.common.util import HttpHelper

log = logging.getLogger(__name__)

def get_byte_response(url: str, headers: Dict[str, str], verify_ssl: bool) -> bytes:
def get_byte_response(url: str, http_client: HttpHelper) -> bytes:
"""get byte response from http request"""
response = util.http_get_request(url=url, headers=headers, verify_ssl=verify_ssl)
response = http_client.http_get_request(url=url)
return response.content

# append to a tar file instead of creating files locally and then tar'ing after
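A short sketch of the updated helper signature; the page-export URL here is only an example of the kind of endpoint it gets called with:

from bookstack_file_exporter.archiver.util import get_byte_response

# `http_client` carries the headers, verify_ssl, timeout, and retry settings,
# so the call site only needs the target URL.
pdf_bytes = get_byte_response(
    url="https://bookstack.example.com/api/pages/42/export/pdf",
    http_client=http_client,
)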
98 changes: 63 additions & 35 deletions bookstack_file_exporter/common/util.py
@@ -1,44 +1,72 @@
import logging
from typing import Dict
import urllib3
# pylint: disable=import-error
import requests
# pylint: disable=import-error
from requests.adapters import HTTPAdapter, Retry

from bookstack_file_exporter.config_helper.models import HttpConfig

log = logging.getLogger(__name__)

def http_get_request(url: str, headers: Dict[str, str],
verify_ssl: bool, timeout: int = 30) -> requests.Response:
"""make http requests and return response object"""
url_prefix = should_verify(url)
try:
with requests.Session() as session:
# {backoff factor} * (2 ** ({number of previous retries}))
# {raise_on_status} if status falls in status_forcelist range
# and retries have been exhausted.
# {status_force_list} 413, 429, 503 defaults are overwritten with additional ones
retries = Retry(total=5,
backoff_factor=0.5,
raise_on_status=True,
status_forcelist=[413, 429, 500, 502, 503, 504])
session.mount(url_prefix, HTTPAdapter(max_retries=retries))
response = session.get(url, headers=headers, verify=verify_ssl, timeout=timeout)
except Exception as req_err:
log.error("Failed to make request for %s", url)
raise req_err
try:
#raise_for_status() throws an exception on codes 400-599
response.raise_for_status()
except requests.exceptions.HTTPError as e:
# this means it either exceeded 50X retries in `http_get_request` handler
# or it returned a 40X which is not expected
log.error("Bookstack request failed with status code: %d on url: %s",
response.status_code, url)
raise e
return response

def should_verify(url: str) -> str:
"""check if http or https"""
if url.startswith("https"):
return "https://"
return "http://"
# disable TLS warnings if using verify_ssl=false
urllib3.disable_warnings()

class HttpHelper:
"""
HttpHelper provides an http request helper with config stored and retries built in

Args:
:headers: <Dict[str, str]> = all headers to use for http requests
:config: <HttpConfig> = Configuration with user inputs for http requests

Returns:
:HttpHelper: instance with methods to help with http requests.
"""
def __init__(self, headers: Dict[str, str],
config: HttpConfig):
self.backoff_factor = config.backoff_factor
self.retry_codes = config.retry_codes
self.retry_count = config.retry_count
self.http_timeout = config.timeout
self.verify_ssl = config.verify_ssl
self._headers = headers

# more details on options: https://urllib3.readthedocs.io/en/stable/reference/urllib3.util.html
def http_get_request(self, url: str) -> requests.Response:
"""make http requests and return response object"""
url_prefix = self.should_verify(url)
try:
with requests.Session() as session:
# {backoff factor} * (2 ** ({number of previous retries}))
# {raise_on_status} if status falls in status_forcelist range
# and retries have been exhausted.
# {status_force_list} 413, 429, 503 defaults are overwritten with additional ones
retries = Retry(total=self.retry_count,
backoff_factor=self.backoff_factor,
raise_on_status=True,
status_forcelist=self.retry_codes)
session.mount(url_prefix, HTTPAdapter(max_retries=retries))
response = session.get(url, headers=self._headers, verify=self.verify_ssl,
timeout=self.http_timeout)
except Exception as req_err:
log.error("Failed to make request for %s", url)
raise req_err
try:
#raise_for_status() throws an exception on codes 400-599
response.raise_for_status()
except requests.exceptions.HTTPError as e:
# this means it either exceeded 50X retries in `http_get_request` handler
# or it returned a 40X which is not expected
log.error("Bookstack request failed with status code: %d on url: %s",
response.status_code, url)
raise e
return response

@staticmethod
def should_verify(url: str) -> str:
"""check if http or https"""
if url.startswith("https"):
return "https://"
return "http://"
4 changes: 2 additions & 2 deletions bookstack_file_exporter/config_helper/config_helper.py
@@ -105,8 +105,8 @@ def _generate_remote_config(self) -> Dict[str, StorageProviderConfig]:
def _generate_headers(self) -> Dict[str, str]:
headers = {}
# add additional_headers provided by user
if self.user_inputs.additional_headers:
for key, value in self.user_inputs.additional_headers.items():
if self.user_inputs.http_config.additional_headers:
for key, value in self.user_inputs.http_config.additional_headers.items():
headers[key] = value

# add default headers
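In other words, user-supplied headers now live under the http_config block; a rough equivalent of the merge done here (the default and auth headers added further down in _generate_headers are not shown in this hunk):

# `user_inputs` is a parsed UserInput model.
headers: dict = {}
if user_inputs.http_config.additional_headers:
    for key, value in user_inputs.http_config.additional_headers.items():
        headers[key] = value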
15 changes: 12 additions & 3 deletions bookstack_file_exporter/config_helper/models.py
@@ -26,17 +26,26 @@ class Assets(BaseModel):
export_attachments: Optional[bool] = False
modify_markdown: Optional[bool] = False
export_meta: Optional[bool] = False
verify_ssl: Optional[bool] = True
# verify_ssl: Optional[bool] = True

class HttpConfig(BaseModel):
"""YAML schema for user provided http settings"""
verify_ssl: Optional[bool] = False
timeout: Optional[int] = 30
backoff_factor: Optional[float] = 2.5
retry_codes: Optional[List[int]] = [413, 429, 500, 502, 503, 504]
retry_count: Optional[int] = 5
additional_headers: Optional[Dict[str, str]] = {}

# pylint: disable=too-few-public-methods
class UserInput(BaseModel):
"""YAML schema for user provided configuration file"""
host: str
additional_headers: Optional[Dict[str, str]] = None
credentials: Optional[BookstackAccess] = None
formats: List[Literal["markdown", "html", "pdf", "plaintext"]]
output_path: Optional[str] = None
assets: Optional[Assets] = Assets()
minio: Optional[ObjectStorageConfig] = None
keep_last: Optional[int] = None
run_interval: Optional[int] = 0
run_interval: Optional[int] = 0
http_config: Optional[HttpConfig] = HttpConfig()
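Since HttpConfig is populated from the user's YAML, an `http_config:` section would map onto the model roughly as below (values shown are the defaults; the User-Agent header is just an example of an additional header):

from bookstack_file_exporter.config_helper.models import HttpConfig

# Every key is optional and falls back to the defaults declared on the model.
http_config = HttpConfig(**{
    "verify_ssl": False,
    "timeout": 30,
    "backoff_factor": 2.5,
    "retry_codes": [413, 429, 500, 502, 503, 504],
    "retry_count": 5,
    "additional_headers": {"User-Agent": "bookstack-file-exporter"},
})
print(http_config.retry_count)  # 5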
10 changes: 4 additions & 6 deletions bookstack_file_exporter/exporter/exporter.py
@@ -5,7 +5,7 @@
from requests import Response

from bookstack_file_exporter.exporter.node import Node
from bookstack_file_exporter.common import util
from bookstack_file_exporter.common.util import HttpHelper

log = logging.getLogger(__name__)

@@ -19,10 +19,9 @@ class NodeExporter():
Returns:
NodeExporter instance to handle building shelve/book/chapter/page relations.
"""
def __init__(self, api_urls: Dict[str, str], headers: Dict[str,str], verify_ssl: bool):
def __init__(self, api_urls: Dict[str, str], http_client: HttpHelper):
self.api_urls = api_urls
self.headers = headers
self.verify_ssl = verify_ssl
self.http_client = http_client

def get_all_shelves(self) -> Dict[int, Node]:
"""
@@ -38,8 +37,7 @@ def get_all_shelves(self) -> Dict[int, Node]:

def _get_json_response(self, url: str) -> List[Dict[str, Union[str,int]]]:
"""get http response data in json format"""
response: Response = util.http_get_request(url=url, headers=self.headers,
verify_ssl=self.verify_ssl)
response: Response = self.http_client.http_get_request(url=url)
return response.json()

def _get_all_ids(self, url: str) -> List[int]:
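Finally, a matching sketch for the exporter side; `config.urls` and the shared `http_client` are assumed as in the earlier examples:

from bookstack_file_exporter.exporter.exporter import NodeExporter

# The exporter now receives the shared HttpHelper instead of raw headers/verify_ssl.
exporter = NodeExporter(config.urls, http_client)
shelves = exporter.get_all_shelves()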