From 1d74c8919e0def9bb6e2fe41c3fac5c963f9cc3e Mon Sep 17 00:00:00 2001
From: Holt Skinner <13262395+holtskinner@users.noreply.github.com>
Date: Thu, 13 Jun 2024 13:22:50 -0500
Subject: [PATCH 1/3] Update README.rst for Python 3.7 (#298)

---
 README.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.rst b/README.rst
index fcd6a019..64ead15b 100644
--- a/README.rst
+++ b/README.rst
@@ -68,6 +68,10 @@ Python >= 3.8
 .. _active: https://devguide.python.org/devcycle/#in-development-main-branch
 .. _maintenance: https://devguide.python.org/devcycle/#maintenance-branches
 
+Unsupported Python Versions
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Python <= 3.7
+
 **NOTE**:
 Python 3.7 was marked as `unsupported`_ by the python community in June 2023.
 We recommend that all developers upgrade to Python 3.8 and newer as soon as
@@ -78,10 +82,6 @@ newer.
 
 .. _unsupported: https://devguide.python.org/versions/#unsupported-versions
 
-Unsupported Python Versions
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-Python <= 3.6
-
 If you are using an `end-of-life`_
 version of Python, we recommend that you update as soon as possible to an actively supported version.
 

From bee4f62f82ace5342c92dddb1e9acc17f1a27fe1 Mon Sep 17 00:00:00 2001
From: Holt Skinner <13262395+holtskinner@users.noreply.github.com>
Date: Tue, 2 Jul 2024 15:45:16 -0500
Subject: [PATCH 2/3] fix: Refactor page.py to improve performance and
 organization (#316)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #312 🦕

Improves upon the hOCR processing improvements made in #313
---
 .../cloud/documentai_toolbox/wrappers/page.py | 245 +++++++++---------
 1 file changed, 122 insertions(+), 123 deletions(-)

diff --git a/google/cloud/documentai_toolbox/wrappers/page.py b/google/cloud/documentai_toolbox/wrappers/page.py
index d224d1de..35a2491e 100644
--- a/google/cloud/documentai_toolbox/wrappers/page.py
+++ b/google/cloud/documentai_toolbox/wrappers/page.py
@@ -18,7 +18,7 @@
 from abc import ABC
 import dataclasses
 from functools import cached_property
-from typing import List, Optional, Type, cast
+from typing import Iterable, List, Optional, Type
 
 import pandas as pd
 
@@ -44,37 +44,55 @@ class Table:
     _page: "Page" = dataclasses.field(repr=False)
 
     @cached_property
-    def body_rows(self):
-        return _table_rows_from_documentai_table_rows(
-            table_rows=list(self.documentai_object.body_rows),
-            text=self._page._document_text,
-        )
+    def body_rows(self) -> List[List[str]]:
+        return self._extract_table_rows(self.documentai_object.body_rows)
 
     @cached_property
-    def header_rows(self):
-        return _table_rows_from_documentai_table_rows(
-            table_rows=list(self.documentai_object.header_rows),
-            text=self._page._document_text,
-        )
+    def header_rows(self) -> List[List[str]]:
+        return self._extract_table_rows(self.documentai_object.header_rows)
 
     def to_dataframe(self) -> pd.DataFrame:
-        r"""Returns pd.DataFrame from documentai.table
+        """Returns pd.DataFrame from documentai.table
 
         Returns:
             pd.DataFrame:
                 The DataFrame of the table.
-
         """
         if not self.body_rows:
             return pd.DataFrame(columns=self.header_rows)
 
-        if self.header_rows:
-            columns = pd.MultiIndex.from_arrays(self.header_rows)
-        else:
-            columns = [None] * len(self.body_rows[0])
+        columns = (
+            pd.MultiIndex.from_arrays(self.header_rows)
+            if self.header_rows
+            else [None] * len(self.body_rows[0])
+        )
 
         return pd.DataFrame(self.body_rows, columns=columns)
 
+    def _extract_table_rows(
+        self, table_rows: Iterable[documentai.Document.Page.Table.TableRow]
+    ) -> List[List[str]]:
+        """Returns a list of rows from table_rows.
+
+        Args:
+            table_rows (Iterable[documentai.Document.Page.Table.TableRow]):
+                Required. The table rows whose cell text should be extracted.
+
+        Returns:
+            List[List[str]]:
+                A list of table rows.
+        """
+        return [
+            [
+                # Newlines removed to improve formatting for export formats.
+                _text_from_layout(cell.layout, self._page._document_text).replace(
+                    "\n", ""
+                )
+                for cell in row.cells
+            ]
+            for row in table_rows
+        ]
+
 
 @dataclasses.dataclass
 class FormField:
@@ -95,7 +113,7 @@ class FormField:
     _page: "Page" = dataclasses.field(repr=False)
 
     @cached_property
-    def field_name(self):
+    def field_name(self) -> str:
         return _trim_text(
             _text_from_layout(
                 self.documentai_object.field_name, self._page._document_text
@@ -103,7 +121,7 @@ def field_name(self):
         )
 
     @cached_property
-    def field_value(self):
+    def field_value(self) -> str:
         return _trim_text(
             _text_from_layout(
                 self.documentai_object.field_value, self._page._document_text
@@ -111,6 +129,23 @@ def field_value(self):
         )
 
 
+def _trim_text(text: str) -> str:
+    """Remove extra space characters from text (blank, newline, tab, etc.)
+
+    Args:
+        text (str):
+            Required. UTF-8 encoded text in reading order
+            from the document.
+
+    Returns:
+        str:
+            Text stripped of surrounding whitespace, newlines replaced by spaces.
+    """
+    # Newline replacement added to correct common
+    # misshapen output from Form Parser.
+    return text.strip().replace("\n", " ")
+
+
 @dataclasses.dataclass
 class _BasePageElement(ABC):
     """Base class for representing a wrapped Document AI Page element (Symbol, Token, Line, Paragraph, Block)."""
@@ -119,24 +154,69 @@ class _BasePageElement(ABC):
     _page: "Page" = dataclasses.field(repr=False)
 
     @cached_property
-    def text(self):
+    def text(self) -> str:
         """
         Text of the page element.
         """
         return _text_from_layout(
-            layout=self.documentai_object.layout, text=self._page._document_text
+            self.documentai_object.layout, self._page._document_text
         )
 
     @cached_property
-    def hocr_bounding_box(self):
+    def hocr_bounding_box(self) -> Optional[str]:
         """
         hOCR bounding box of the page element.
         """
         return _get_hocr_bounding_box(
-            element_with_layout=self.documentai_object,
-            page_dimension=self._page.documentai_object.dimension,
+            self.documentai_object, self._page.documentai_object.dimension
         )
 
+    # This field is a cached property to improve export times for hOCR
+    # as outlined in https://github.com/googleapis/python-documentai-toolbox/issues/312
+    @cached_property
+    def _text_segment(self) -> documentai.Document.TextAnchor.TextSegment:
+        """
+        Page element text segment.
+        """
+        return self.documentai_object.layout.text_anchor.text_segments[0]
+
+    def _get_children_of_element(
+        self, potential_children: List["_BasePageElement"]
+    ) -> List["_BasePageElement"]:
+        """
+        Filters potential child elements to identify only those fully contained within this element.
+
+        This method iterates through a list of potential child elements, checking if their
+        start and end indices fall completely within the start and end indices of this element.
+        Elements that are only partially contained or entirely outside this element's range are excluded.
+
+        Args:
+            potential_children (List[_BasePageElement]):
+                Required. A list of wrapped page elements (e.g., words, lines, paragraphs)
+                that could potentially be children of this element.
+
+        Returns:
+            List[_BasePageElement]:
+                A new list containing only the wrapped page elements that are fully
+                contained within this element, maintaining their original order.
+        """
+        start_index = self._text_segment.start_index
+        end_index = self._text_segment.end_index
+
+        children = []
+        for child in potential_children:
+            child_start_index = child._text_segment.start_index
+            child_end_index = child._text_segment.end_index
+
+            if child_start_index >= end_index:
+                break  # Stop early; assumes potential_children is sorted by start index
+            if (
+                start_index <= child_start_index < end_index
+                and start_index < child_end_index <= end_index
+            ):
+                children.append(child)
+        return children
+
 
 @dataclasses.dataclass
 class Symbol(_BasePageElement):
@@ -151,7 +231,7 @@ class Symbol(_BasePageElement):
     """
 
     @cached_property
-    def hocr_bounding_box(self):
+    def hocr_bounding_box(self) -> Optional[str]:
         # Symbols are not represented in hOCR
         return None
 
@@ -170,11 +250,8 @@ class Token(_BasePageElement):
     """
 
     @cached_property
-    def symbols(self):
-        return cast(
-            List[Symbol],
-            _get_children_of_element(self.documentai_object, self._page.symbols),
-        )
+    def symbols(self) -> List[Symbol]:
+        return self._get_children_of_element(self._page.symbols)
 
 
 @dataclasses.dataclass
@@ -186,16 +263,13 @@ class Line(_BasePageElement):
             Required. The original object.
         text (str):
             Required. The text of the Line.
-        _tokens (List[Token]):
+        tokens (List[Token]):
             Optional. The Tokens contained within the Line.
     """
 
     @cached_property
-    def tokens(self):
-        return cast(
-            List[Token],
-            _get_children_of_element(self.documentai_object, self._page.tokens),
-        )
+    def tokens(self) -> List[Token]:
+        return self._get_children_of_element(self._page.tokens)
 
 
 @dataclasses.dataclass
@@ -207,20 +281,13 @@ class Paragraph(_BasePageElement):
             Required. The original object.
         text (str):
             Required. The text of the Paragraph.
-        _lines (List[Line]):
+        lines (List[Line]):
             Optional. The Lines contained within the Paragraph.
     """
 
-    _lines: Optional[List[Line]] = dataclasses.field(
-        init=False, repr=False, default=None
-    )
-
     @cached_property
-    def lines(self):
-        return cast(
-            List[Line],
-            _get_children_of_element(self.documentai_object, self._page.lines),
-        )
+    def lines(self) -> List[Line]:
+        return self._get_children_of_element(self._page.lines)
 
 
 @dataclasses.dataclass
@@ -232,16 +299,13 @@ class Block(_BasePageElement):
             Required. The original object.
         text (str):
             Required. The text of the Block.
-        _paragraphs (List[Paragraph]):
+        paragraphs (List[Paragraph]):
             Optional. The Paragraphs contained within the Block.
     """
 
     @cached_property
-    def paragraphs(self):
-        return cast(
-            List[Paragraph],
-            _get_children_of_element(self.documentai_object, self._page.paragraphs),
-        )
+    def paragraphs(self) -> List[Paragraph]:
+        return self._get_children_of_element(self._page.paragraphs)
 
 
 @dataclasses.dataclass
@@ -262,33 +326,11 @@ def hocr_bounding_box(self):
         return None
 
 
-def _table_rows_from_documentai_table_rows(
-    table_rows: List[documentai.Document.Page.Table.TableRow], text: str
-) -> List[List[str]]:
-    r"""Returns a list of rows from table_rows.
-
-    Args:
-        table_rows (List[documentai.Document.Page.Table.TableRow]):
-            Required. A documentai.Document.Page.Table.TableRow.
-        text (str):
-            Required. UTF-8 encoded text in reading order
-            from the document.
-
-    Returns:
-        List[List[str]]:
-            A list of table rows.
-    """
-    return [
-        [_text_from_layout(cell.layout, text).replace("\n", "") for cell in row.cells]
-        for row in table_rows
-    ]
-
-
 def _get_hocr_bounding_box(
     element_with_layout: ElementWithLayout,
     page_dimension: documentai.Document.Page.Dimension,
 ) -> Optional[str]:
-    r"""Returns a hOCR bounding box string.
+    """Returns a hOCR bounding box string.
 
     Args:
         element_with_layout (ElementWithLayout):
@@ -298,7 +340,7 @@ def _get_hocr_bounding_box(
 
     Returns:
         Optional[str]:
-            hOCR bounding box sring.
+            hOCR bounding box string.
     """
     if not element_with_layout.layout.bounding_poly:
         return None
@@ -320,7 +362,7 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str
 
     Args:
         layout (documentai.Document.Page.Layout):
-            Required. an element with layout fields.
+            Required. An element with layout fields.
         text (str):
             Required. UTF-8 encoded text in reading order
             of the `documentai.Document` containing the layout element.
@@ -329,6 +371,8 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str
         str:
             Text from a single element.
     """
+    if not layout.text_anchor or not layout.text_anchor.text_segments:
+        return ""
 
     # Note: `layout.text_anchor.text_segments` are indexes into the full Document text.
     # https://cloud.google.com/document-ai/docs/reference/rest/v1/Document#textsegment
@@ -338,50 +382,6 @@ def _text_from_layout(layout: documentai.Document.Page.Layout, text: str) -> str
     )
 
 
-def _get_children_of_element(
-    element: ElementWithLayout, children: List[ElementWithLayout]
-) -> List[ElementWithLayout]:
-    r"""Returns a list of children inside element.
-
-    Args:
-        element (ElementWithLayout):
-            Required. A element in a page.
-        children (List[ElementWithLayout]):
-            Required. List of wrapped children.
-
-    Returns:
-        List[ElementWithLayout]:
-            A list of wrapped children that are inside a element.
-    """
-    start_index = element.layout.text_anchor.text_segments[0].start_index
-    end_index = element.layout.text_anchor.text_segments[0].end_index
-
-    return [
-        child
-        for child in children
-        if start_index
-        <= child.documentai_object.layout.text_anchor.text_segments[0].start_index
-        < end_index
-        and start_index
-        < child.documentai_object.layout.text_anchor.text_segments[0].end_index
-        <= end_index
-    ]
-
-
-def _trim_text(text: str) -> str:
-    r"""Remove extra space characters from text (blank, newline, tab, etc.)
-
-    Args:
-        text (str):
-            Required. UTF-8 encoded text in reading order
-            from the document.
-    Returns:
-        str:
-            Text without trailing spaces/newlines
-    """
-    return text.strip().replace("\n", " ")
-
-
 @dataclasses.dataclass
 class Page:
     """Represents a wrapped documentai.Document.Page .
@@ -485,6 +485,5 @@ def blocks(self):
     @cached_property
     def hocr_bounding_box(self):
         return _get_hocr_bounding_box(
-            element_with_layout=self.documentai_object,
-            page_dimension=self.documentai_object.dimension,
+            self.documentai_object, self.documentai_object.dimension
         )

From 3fcb06b16d176269e85924c86ec49a6f8758e59f Mon Sep 17 00:00:00 2001
From: "release-please[bot]"
 <55107282+release-please[bot]@users.noreply.github.com>
Date: Tue, 2 Jul 2024 14:04:25 -0700
Subject: [PATCH 3/3] chore(main): release 0.13.5-alpha (#319)

Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com>
---
 CHANGELOG.md                               | 7 +++++++
 google/cloud/documentai_toolbox/version.py | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 64e5ba41..4d8415b5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,12 @@
 # Changelog
 
+## [0.13.5-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.13.4-alpha...v0.13.5-alpha) (2024-07-02)
+
+
+### Bug Fixes
+
+* Refactor page.py to improve performance and organization ([#316](https://github.com/googleapis/python-documentai-toolbox/issues/316)) ([bee4f62](https://github.com/googleapis/python-documentai-toolbox/commit/bee4f62f82ace5342c92dddb1e9acc17f1a27fe1))
+
 ## [0.13.4-alpha](https://github.com/googleapis/python-documentai-toolbox/compare/v0.13.3-alpha...v0.13.4-alpha) (2024-06-13)
 
 
diff --git a/google/cloud/documentai_toolbox/version.py b/google/cloud/documentai_toolbox/version.py
index 7e2ef9e9..05a8bc38 100644
--- a/google/cloud/documentai_toolbox/version.py
+++ b/google/cloud/documentai_toolbox/version.py
@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-__version__ = "0.13.4-alpha"
+__version__ = "0.13.5-alpha"