From 55b846ed95afa64419e89945beb5a8d530e305be Mon Sep 17 00:00:00 2001
From: Trevor Bergeron
Date: Wed, 14 May 2025 23:41:05 +0000
Subject: [PATCH] fix: Support str.replace re.compile with flags

When `pat` is a compiled `re.Pattern`, merge the flags stored on the
pattern (`pat.flags`) into the `flags` argument of `str.replace`, so that
e.g. `re.compile("hEllo", flags=re.I)` is honoured instead of ignored.

In `_parse_flags`:
  * test each known flag individually (`flags & reflag`); the previous
    `flags & flags` was true for any non-zero `flags`, so every re2 flag
    was emitted regardless of what the caller asked for;
  * clear `re.UNICODE` before rejecting unmapped flags, because compiled
    patterns usually carry it. The check uses a bitwise `&` so the bit is
    only toggled off when it is actually set; a boolean `and` would XOR
    the bit *on* for other leftover flags and misreport them in the
    NotImplementedError.

Also adds a system test covering a compiled pattern with `re.I`.
---
 bigframes/operations/strings.py               | 28 +++++++++++++------
 tests/system/small/operations/test_strings.py |  1 +
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/bigframes/operations/strings.py b/bigframes/operations/strings.py
index a8430b0b0e..9022a1665e 100644
--- a/bigframes/operations/strings.py
+++ b/bigframes/operations/strings.py
@@ -15,7 +15,7 @@
 from __future__ import annotations
 
 import re
-from typing import cast, Literal, Optional, Union
+from typing import Literal, Optional, Union
 
 import bigframes_vendored.constants as constants
 import bigframes_vendored.pandas.core.strings.accessor as vendorstr
@@ -230,21 +230,26 @@ def replace(
         flags: int = 0,
         regex: bool = False,
     ) -> series.Series:
-        is_compiled = isinstance(pat, re.Pattern)
-        patstr = cast(str, pat.pattern if is_compiled else pat)  # type: ignore
+        if isinstance(pat, re.Pattern):
+            assert isinstance(pat.pattern, str)
+            pat_str = pat.pattern
+            flags = pat.flags | flags
+        else:
+            pat_str = pat
+
         if case is False:
-            return self.replace(pat, repl, flags=flags | re.IGNORECASE, regex=True)
+            return self.replace(pat_str, repl, flags=flags | re.IGNORECASE, regex=True)
         if regex:
             re2flags = _parse_flags(flags)
             if re2flags:
-                patstr = re2flags + patstr
-            return self._apply_unary_op(ops.RegexReplaceStrOp(pat=patstr, repl=repl))
+                pat_str = re2flags + pat_str
+            return self._apply_unary_op(ops.RegexReplaceStrOp(pat=pat_str, repl=repl))
         else:
-            if is_compiled:
+            if isinstance(pat, re.Pattern):
                 raise ValueError(
                     "Must set 'regex'=True if using compiled regex pattern."
                 )
-            return self._apply_unary_op(ops.ReplaceStrOp(pat=patstr, repl=repl))
+            return self._apply_unary_op(ops.ReplaceStrOp(pat=pat_str, repl=repl))
 
     def startswith(
         self,
@@ -318,10 +323,15 @@ def to_blob(self, connection: Optional[str] = None) -> series.Series:
 def _parse_flags(flags: int) -> Optional[str]:
     re2flags = []
     for reflag, re2flag in REGEXP_FLAGS.items():
-        if flags & flags:
+        if flags & reflag:
             re2flags.append(re2flag)
             flags = flags ^ reflag
 
+    # re2 handles unicode fine by default
+    # most compiled re in python will have unicode set
+    if re.U & flags:
+        flags = flags ^ re.U
+
     # Remaining flags couldn't be mapped to re2 engine
     if flags:
         raise NotImplementedError(
diff --git a/tests/system/small/operations/test_strings.py b/tests/system/small/operations/test_strings.py
index e4824875b4..032d93c19d 100644
--- a/tests/system/small/operations/test_strings.py
+++ b/tests/system/small/operations/test_strings.py
@@ -98,6 +98,7 @@ def test_str_extract(scalars_dfs, pat):
         (re.compile("(?i).e.."), "blah", None, 0, True),
         ("H", "h", True, 0, False),
         (", ", "__", True, 0, False),
+        (re.compile(r"hEllo", flags=re.I), "blah", None, 0, True),
     ],
 )
 def test_str_replace(scalars_dfs, pat, repl, case, flags, regex):