From 52f312c5bc46d898832b3cf88db213a064363772 Mon Sep 17 00:00:00 2001
From: Holt Skinner <holtskinner@google.com>
Date: Fri, 23 Feb 2024 12:17:43 -0600
Subject: [PATCH] add fitz test

---
 .../documentai_toolbox/wrappers/document.py   | 86 +++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py
index 6df97312..b14c2880 100644
--- a/google/cloud/documentai_toolbox/wrappers/document.py
+++ b/google/cloud/documentai_toolbox/wrappers/document.py
@@ -20,6 +20,7 @@
 import glob
 import os
 import re
+import fitz
 from typing import Dict, List, Optional, Type, Union
 
 from google.api_core.operation import from_gapic as operation_from_gapic
@@ -753,6 +754,91 @@ def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
                 output_files.append(output_filename)
         return output_files
 
+    def split_pdf_mupdf(self, pdf_path: str, output_path: str) -> List[str]:
+        """
+        Splits local PDF file into multiple PDF files based on output from a Splitter/Classifier processor.
+
+        Args:
+            pdf_path (str):
+                Required. The path to the PDF file.
+            output_path (str):
+                Required. The path to the output directory.
+        Returns:
+            List[str]:
+                The file names (not full paths) of the PDFs written to `output_path`.
+        """
+        output_files: List[str] = []
+        input_filename, input_extension = os.path.splitext(os.path.basename(pdf_path))
+
+        # Context manager guarantees the source PDF is closed even if a split fails.
+        with fitz.open(pdf_path) as pdf_document:
+            for entity in self.entities:
+                subdoc_type = entity.type_ or "subdoc"
+                page_range = (
+                    f"pg{entity.start_page + 1}"
+                    if entity.start_page == entity.end_page
+                    else f"pg{entity.start_page + 1}-{entity.end_page + 1}"
+                )
+                output_filename = (
+                    f"{input_filename}_{page_range}_{subdoc_type}{input_extension}"
+                )
+                output_file_path = os.path.join(output_path, output_filename)
+
+                # Close each sub-document after saving so handles do not accumulate.
+                with fitz.open() as subdoc:
+                    subdoc.insert_pdf(
+                        pdf_document,
+                        from_page=entity.start_page,
+                        to_page=entity.end_page,
+                    )
+                    subdoc.save(output_file_path, garbage=3, deflate=True)
+
+                output_files.append(output_filename)
+
+        return output_files
+
+    def split_pdf_mupdf_optimized(self, pdf_path: str, output_path: str) -> List[str]:
+        """
+        Splits local PDF file into multiple PDF files based on output from a Splitter/Classifier processor.
+
+        Args:
+            pdf_path (str):
+                Required. The path to the PDF file.
+            output_path (str):
+                Required. The path to the output directory.
+        Returns:
+            List[str]:
+                The file names (not full paths) of the PDFs written to `output_path`.
+        """
+        output_files: List[str] = []
+        input_filename, input_extension = os.path.splitext(os.path.basename(pdf_path))
+
+        with fitz.open(pdf_path) as pdf_document:
+            for entity in self.entities:
+                subdoc_type = entity.type_ or "subdoc"
+                page_range = (
+                    f"pg{entity.start_page + 1}"
+                    if entity.start_page == entity.end_page
+                    else f"pg{entity.start_page + 1}-{entity.end_page + 1}"
+                )
+                output_filename = (
+                    f"{input_filename}_{page_range}_{subdoc_type}{input_extension}"
+                )
+                output_file_path = os.path.join(output_path, output_filename)
+
+                # Close each sub-document after saving so handles do not accumulate.
+                with fitz.open() as subdoc:
+                    subdoc.insert_pdf(
+                        pdf_document,
+                        from_page=entity.start_page,
+                        to_page=entity.end_page,
+                    )
+                    subdoc.save(output_file_path, garbage=3, deflate=True)
+
+                output_files.append(output_filename)
+
+        return output_files
+
     def convert_document_to_annotate_file_response(self) -> AnnotateFileResponse:
         r"""Convert OCR data from `Document.proto` to `AnnotateFileResponse.proto` for Vision API.