fix(table): fix _Row.cells can raise IndexError

scanny · scanny · commit f4a48b5565a3 · 2024-04-29T16:06:02.000-07:00
The original implementation of `_Row.cells` did not take into account
the fact that rows could include unoccupied grid cells at the beginning
and/or end of the row.

This "advanced" feature of tables is sometimes used by the Word table
layout algorithm when the user does not carefully align the right
boundary of cells during resizing, so while quite unusual to be used on
purpose, this arises with some frequency in human-authored documents in
the wild.

The prior implementation of `_Row.cells` assumed that `Table.cells()`
was uniform and the cells for a row could reliably be computed from
the table column-count and row and column offsets. That assumption
does not always hold and can raise `IndexError` when omitted cells are
present.

This reimplementation remedies that situation. As a side-effect it
should also perform much better when reading large tables.
diff --git a/src/docx/table.py b/src/docx/table.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, cast, overload
+from typing import TYPE_CHECKING, Iterator, cast, overload
 
 from typing_extensions import TypeAlias
 
@@ -102,7 +102,10 @@ def columns(self):
         return _Columns(self._tbl, self)
 
     def row_cells(self, row_idx: int) -> list[_Cell]:
-        """Sequence of cells in the row at `row_idx` in this table."""
+        """DEPRECATED: Use `table.rows[row_idx].cells` instead.
+
+        Sequence of cells in the row at `row_idx` in this table.
+        """
         column_count = self._column_count
         start = row_idx * column_count
         end = start + column_count
@@ -403,7 +406,36 @@ def cells(self) -> tuple[_Cell, ...]:
           layout-grid positions using `.grid_cols_before` and `.grid_cols_after`.
 
         """
-        return tuple(self.table.row_cells(self._index))
+
+        def iter_tc_cells(tc: CT_Tc) -> Iterator[_Cell]:
+            """Generate a cell object for each layout-grid cell in `tc`.
+
+            In particular, a `<w:tc>` element with a horizontal "span" will generate the same cell
+            multiple times, one for each grid-cell being spanned. This approximates a row in a
+            "uniform" table, where each row has a cell for each column in the table.
+            """
+            # -- a cell comprising the second or later row of a vertical span is indicated by
+            # -- tc.vMerge="continue" (the default value of the `w:vMerge` attribute, when it is
+            # -- present in the XML). The `w:tc` element at the same grid-offset in the prior row
+            # -- is guaranteed to be the same width (gridSpan). So we can delegate content
+            # -- discovery to that prior-row `w:tc` element (recursively) until we arrive at the
+            # -- "root" cell -- for the vertical span.
+            if tc.vMerge == "continue":
+                yield from iter_tc_cells(tc._tc_above)  # pyright: ignore[reportPrivateUsage]
+                return
+
+            # -- Otherwise, vMerge is either "restart" or None, meaning this `tc` holds the actual
+            # -- content of the cell (whether it is vertically merged or not).
+            cell = _Cell(tc, self.table)
+            for _ in range(tc.grid_span):
+                yield cell
+
+        def _iter_row_cells() -> Iterator[_Cell]:
+            """Generate `_Cell` instance for each populated layout-grid cell in this row."""
+            for tc in self._tr.tc_lst:
+                yield from iter_tc_cells(tc)
+
+        return tuple(_iter_row_cells())
 
     @property
     def grid_cols_after(self) -> int:
diff --git a/tests/test_table.py b/tests/test_table.py
@@ -782,18 +782,41 @@ def it_can_change_its_height_rule(
         row.height_rule = new_value
         assert row._tr.xml == xml(expected_cxml)
 
+    @pytest.mark.parametrize(
+        ("tbl_cxml", "row_idx", "expected_len"),
+        [
+            # -- cell corresponds to single layout-grid cell --
+            ("w:tbl/w:tr/w:tc/w:p", 0, 1),
+            # -- cell has a horizontal span --
+            ("w:tbl/w:tr/w:tc/(w:tcPr/w:gridSpan{w:val=2},w:p)", 0, 2),
+            # -- cell is in latter row of vertical span --
+            (
+                "w:tbl/(w:tr/w:tc/(w:tcPr/w:vMerge{w:val=restart},w:p),"
+                "w:tr/w:tc/(w:tcPr/w:vMerge,w:p))",
+                1,
+                1,
+            ),
+            # -- cell both has horizontal span and is latter row of vertical span --
+            (
+                "w:tbl/(w:tr/w:tc/(w:tcPr/(w:gridSpan{w:val=2},w:vMerge{w:val=restart}),w:p),"
+                "w:tr/w:tc/(w:tcPr/(w:gridSpan{w:val=2},w:vMerge),w:p))",
+                1,
+                2,
+            ),
+        ],
+    )
     def it_provides_access_to_its_cells(
-        self, _index_prop_: Mock, table_prop_: Mock, table_: Mock, parent_: Mock
+        self, tbl_cxml: str, row_idx: int, expected_len: int, parent_: Mock
     ):
-        row = _Row(cast(CT_Row, element("w:tr")), parent_)
-        _index_prop_.return_value = row_idx = 6
-        expected_cells = (1, 2, 3)
-        table_.row_cells.return_value = list(expected_cells)
+        tbl = cast(CT_Tbl, element(tbl_cxml))
+        tr = tbl.tr_lst[row_idx]
+        table = Table(tbl, parent_)
+        row = _Row(tr, table)
 
         cells = row.cells
 
-        table_.row_cells.assert_called_once_with(row_idx)
-        assert cells == expected_cells
+        assert len(cells) == expected_len
+        assert all(type(c) is _Cell for c in cells)
 
     def it_provides_access_to_the_table_it_belongs_to(self, parent_: Mock, table_: Mock):
         parent_.table = table_
@@ -821,7 +844,7 @@ def table_(self, request: FixtureRequest):
 
     @pytest.fixture
     def table_prop_(self, request: FixtureRequest, table_: Mock):
-        return property_mock(request, _Row, "table", return_value=table_)
+        return property_mock(request, _Row, "table")
 
 
 class Describe_Rows: