Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions bigframes/blob/_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,9 @@ def pdf_extract_func(src_obj_ref_rt: str) -> str:
return result_json


pdf_extract_def = FunctionDef(pdf_extract_func, ["pypdf", "requests", "pypdf[crypto]"])
pdf_extract_def = FunctionDef(
pdf_extract_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography==43.0.3"]
)


# Extracts text from a PDF url and chunks it simultaneously
Expand Down Expand Up @@ -527,4 +529,6 @@ def pdf_chunk_func(src_obj_ref_rt: str, chunk_size: int, overlap_size: int) -> s
return result_json


pdf_chunk_def = FunctionDef(pdf_chunk_func, ["pypdf", "requests", "pypdf[crypto]"])
pdf_chunk_def = FunctionDef(
pdf_chunk_func, ["pypdf>=5.3.1,<6.0.0", "requests", "cryptography==43.0.3"]
)
127 changes: 62 additions & 65 deletions tests/system/large/blob/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -302,37 +302,16 @@ def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection:


@pytest.mark.parametrize(
"verbose, expected",
"verbose",
[
(
True,
pd.Series(
[
{"status": "File has not been decrypted", "content": ""},
{
"status": "",
"content": "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. ",
},
]
),
),
(
False,
pd.Series(
[
"",
"Sample PDF This is a testing file. Some dummy messages are used for testing purposes. ",
],
name="pdf",
),
),
(True),
(False),
],
)
def test_blob_pdf_extract(
pdf_mm_df: bpd.DataFrame,
verbose: bool,
bq_connection: str,
expected: pd.Series,
):
actual = (
pdf_mm_df["pdf"]
Expand All @@ -341,49 +320,44 @@ def test_blob_pdf_extract(
.to_pandas()
)

pd.testing.assert_series_equal(
actual,
expected,
check_dtype=False,
check_index=False,
# check relative length
expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes."
expected_len = len(expected_text)

actual_text = ""
if verbose:
# The first entry is a PDF whose extraction fails (non-empty status), so we check the first successful result
successful_results = actual[actual.apply(lambda x: x["status"] == "")]
actual_text = successful_results.apply(lambda x: x["content"]).iloc[0]
else:
actual_text = actual[actual != ""].iloc[0]
actual_len = len(actual_text)

relative_length_tolerance = 0.25
min_acceptable_len = expected_len * (1 - relative_length_tolerance)
max_acceptable_len = expected_len * (1 + relative_length_tolerance)
assert min_acceptable_len <= actual_len <= max_acceptable_len, (
f"Item (verbose={verbose}): Extracted text length {actual_len} is outside the acceptable range "
f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. "
f"Expected reference length was {expected_len}. "
)

# check for major keywords
major_keywords = ["Sample", "PDF", "testing", "dummy", "messages"]
for keyword in major_keywords:
assert (
keyword.lower() in actual_text.lower()
), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in extracted text. "


@pytest.mark.parametrize(
"verbose, expected",
"verbose",
[
(
True,
pd.Series(
[
{"status": "File has not been decrypted", "content": []},
{
"status": "",
"content": [
"Sample PDF This is a testing file. Some ",
"dummy messages are used for testing ",
"purposes. ",
],
},
]
),
),
(
False,
pd.Series(
[
pd.NA,
"Sample PDF This is a testing file. Some ",
"dummy messages are used for testing ",
"purposes. ",
],
),
),
(True),
(False),
],
)
def test_blob_pdf_chunk(
pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str, expected: pd.Series
):
def test_blob_pdf_chunk(pdf_mm_df: bpd.DataFrame, verbose: bool, bq_connection: str):
actual = (
pdf_mm_df["pdf"]
.blob.pdf_chunk(
Expand All @@ -397,13 +371,36 @@ def test_blob_pdf_chunk(
.to_pandas()
)

pd.testing.assert_series_equal(
actual,
expected,
check_dtype=False,
check_index=False,
# check relative length
expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes."
expected_len = len(expected_text)

actual_text = ""
if verbose:
# The first entry is a PDF whose extraction fails (non-empty status), so we check the first successful result
successful_results = actual[actual.apply(lambda x: x["status"] == "")]
actual_text = "".join(successful_results.apply(lambda x: x["content"]).iloc[0])
else:
# First entry is NA
actual_text = "".join(actual.dropna())
actual_len = len(actual_text)

relative_length_tolerance = 0.25
min_acceptable_len = expected_len * (1 - relative_length_tolerance)
max_acceptable_len = expected_len * (1 + relative_length_tolerance)
assert min_acceptable_len <= actual_len <= max_acceptable_len, (
f"Item (verbose={verbose}): Extracted text length {actual_len} is outside the acceptable range "
f"[{min_acceptable_len:.0f}, {max_acceptable_len:.0f}]. "
f"Expected reference length was {expected_len}. "
)

# check for major keywords
major_keywords = ["Sample", "PDF", "testing", "dummy", "messages"]
for keyword in major_keywords:
assert (
keyword.lower() in actual_text.lower()
), f"Item (verbose={verbose}): Expected keyword '{keyword}' not found in extracted text. "


@pytest.mark.parametrize(
"model_name, verbose",
Expand Down