Skip to content

Commit fc7df4b

Browse files
committed
[BEAM-12764] Revert "Merge pull request apache#15165 from [BEAM-12593] Verify DataFrame API on pandas 1.3.0"
This reverts commit faac725.
1 parent 76d0dcb commit fc7df4b

File tree

4 files changed

+22
-105
lines changed

4 files changed

+22
-105
lines changed

sdks/python/apache_beam/dataframe/frames.py

+6 -15
Original file line number | Diff line number | Diff line change
@@ -55,9 +55,6 @@
5555
'DeferredDataFrame',
5656
]
5757

58-
# Get major, minor version
59-
PD_VERSION = tuple(map(int, pd.__version__.split('.')[0:2]))
60-
6158

6259
def populate_not_implemented(pd_type):
6360
def wrapper(deferred_type):
@@ -1935,7 +1932,7 @@ def value_counts(
19351932
else:
19361933
column = self
19371934

1938-
result = column.groupby(column, dropna=dropna).size()
1935+
result = column.groupby(column).size()
19391936

19401937
# groupby.size() names the index, which we don't need
19411938
result.index.name = None
@@ -2395,8 +2392,8 @@ def aggregate(self, func, axis, *args, **kwargs):
23952392
if func in ('quantile',):
23962393
return getattr(self, func)(*args, axis=axis, **kwargs)
23972394

2398-
# In pandas<1.3.0, maps to a property, args are ignored
2399-
if func in ('size',) and PD_VERSION < (1, 3):
2395+
# Maps to a property, args are ignored
2396+
if func in ('size',):
24002397
return getattr(self, func)
24012398

24022399
# We also have specialized distributed implementations for these. They only
@@ -3395,7 +3392,7 @@ def melt(self, ignore_index, **kwargs):
33953392

33963393
@frame_base.with_docs_from(pd.DataFrame)
33973394
def value_counts(self, subset=None, sort=False, normalize=False,
3398-
ascending=False, dropna=True):
3395+
ascending=False):
33993396
"""``sort`` is ``False`` by default, and ``sort=True`` is not supported
34003397
because it imposes an ordering on the dataset which likely will not be
34013398
preserved."""
@@ -3406,16 +3403,10 @@ def value_counts(self, subset=None, sort=False, normalize=False,
34063403
"ordering on the dataset which likely will not be preserved.",
34073404
reason="order-sensitive")
34083405
columns = subset or list(self.columns)
3409-
3410-
if dropna:
3411-
dropped = self.dropna()
3412-
else:
3413-
dropped = self
3414-
3415-
result = dropped.groupby(columns, dropna=dropna).size()
3406+
result = self.groupby(columns).size()
34163407

34173408
if normalize:
3418-
return result/dropped.length()
3409+
return result/self.dropna().length()
34193410
else:
34203411
return result
34213412

sdks/python/apache_beam/dataframe/frames_test.py

+5 -41
Original file line number | Diff line number | Diff line change
@@ -25,8 +25,7 @@
2525
from apache_beam.dataframe import frame_base
2626
from apache_beam.dataframe import frames
2727

28-
# Get major, minor version
29-
PD_VERSION = tuple(map(int, pd.__version__.split('.')[0:2]))
28+
PD_VERSION = tuple(map(int, pd.__version__.split('.')))
3029

3130
GROUPBY_DF = pd.DataFrame({
3231
'group': ['a' if i % 5 == 0 or i % 3 == 0 else 'b' for i in range(100)],
@@ -236,17 +235,6 @@ def test_dataframe_arithmetic(self):
236235
self._run_test(
237236
lambda df, df2: df.subtract(2).multiply(df2).divide(df), df, df2)
238237

239-
@unittest.skipIf(PD_VERSION < (1, 3), "dropna=False is new in pandas 1.3")
240-
def test_value_counts_dropna_false(self):
241-
df = pd.DataFrame({
242-
'first_name': ['John', 'Anne', 'John', 'Beth'],
243-
'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']
244-
})
245-
# TODO(BEAM-12495): Remove the assertRaises this when the underlying bug in
246-
# https://github.com/pandas-dev/pandas/issues/36470 is fixed.
247-
with self.assertRaises(NotImplementedError):
248-
self._run_test(lambda df: df.value_counts(dropna=False), df)
249-
250238
def test_get_column(self):
251239
df = pd.DataFrame({
252240
'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
@@ -381,15 +369,10 @@ def test_combine_dataframe_fill(self):
381369
nonparallel=True)
382370

383371
def test_combine_Series(self):
384-
s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0})
385-
s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0})
386-
self._run_test(
387-
lambda s1,
388-
s2: s1.combine(s2, max),
389-
s1,
390-
s2,
391-
nonparallel=True,
392-
check_proxy=False)
372+
with expressions.allow_non_parallel_operations():
373+
s1 = pd.Series({'falcon': 330.0, 'eagle': 160.0})
374+
s2 = pd.Series({'falcon': 345.0, 'eagle': 200.0, 'duck': 30.0})
375+
self._run_test(lambda s1, s2: s1.combine(s2, max), s1, s2)
393376

394377
def test_combine_first_dataframe(self):
395378
df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
@@ -604,27 +587,8 @@ def test_value_counts_with_nans(self):
604587
self._run_test(lambda df: df.value_counts(), df)
605588
self._run_test(lambda df: df.value_counts(normalize=True), df)
606589

607-
if PD_VERSION >= (1, 3):
608-
# dropna=False is new in pandas 1.3
609-
# TODO(BEAM-12495): Remove the assertRaises this when the underlying bug
610-
# in https://github.com/pandas-dev/pandas/issues/36470 is fixed.
611-
with self.assertRaises(NotImplementedError):
612-
self._run_test(lambda df: df.value_counts(dropna=False), df)
613-
614-
# Test the defaults.
615590
self._run_test(lambda df: df.num_wings.value_counts(), df)
616591
self._run_test(lambda df: df.num_wings.value_counts(normalize=True), df)
617-
self._run_test(lambda df: df.num_wings.value_counts(dropna=False), df)
618-
619-
# Test the combination interactions.
620-
for normalize in (True, False):
621-
for dropna in (True, False):
622-
self._run_test(
623-
lambda df,
624-
dropna=dropna,
625-
normalize=normalize: df.num_wings.value_counts(
626-
dropna=dropna, normalize=normalize),
627-
df)
628592

629593
def test_value_counts_does_not_support_sort(self):
630594
df = pd.DataFrame({

sdks/python/apache_beam/dataframe/pandas_doctests_test.py

+10 -48
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,6 @@
2020
import pandas as pd
2121

2222
from apache_beam.dataframe import doctests
23-
from apache_beam.dataframe.frames import PD_VERSION
2423
from apache_beam.dataframe.pandas_top_level_functions import _is_top_level_function
2524

2625

@@ -69,8 +68,7 @@ def test_ndframe_tests(self):
6968
"df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'})"
7069
],
7170
'pandas.core.generic.NDFrame.fillna': [
72-
'df.fillna(method=\'ffill\')',
73-
'df.fillna(method="ffill")',
71+
"df.fillna(method='ffill')",
7472
'df.fillna(value=values, limit=1)',
7573
],
7674
'pandas.core.generic.NDFrame.sort_values': ['*'],
@@ -166,8 +164,7 @@ def test_dataframe_tests(self):
166164
'pandas.core.frame.DataFrame.cumprod': ['*'],
167165
'pandas.core.frame.DataFrame.diff': ['*'],
168166
'pandas.core.frame.DataFrame.fillna': [
169-
'df.fillna(method=\'ffill\')',
170-
'df.fillna(method="ffill")',
167+
"df.fillna(method='ffill')",
171168
'df.fillna(value=values, limit=1)',
172169
],
173170
'pandas.core.frame.DataFrame.items': ['*'],
@@ -240,17 +237,13 @@ def test_dataframe_tests(self):
240237
# reindex not supported
241238
's2 = s.reindex([1, 0, 2, 3])',
242239
],
243-
'pandas.core.frame.DataFrame.resample': ['*'],
244-
'pandas.core.frame.DataFrame.values': ['*'],
245240
},
246241
not_implemented_ok={
247242
'pandas.core.frame.DataFrame.transform': [
248243
# str arg not supported. Tested with np.sum in
249244
# frames_test.py::DeferredFrameTest::test_groupby_transform_sum
250245
"df.groupby('Date')['Data'].transform('sum')",
251246
],
252-
'pandas.core.frame.DataFrame.swaplevel': ['*'],
253-
'pandas.core.frame.DataFrame.melt': ['*'],
254247
'pandas.core.frame.DataFrame.reindex_axis': ['*'],
255248
'pandas.core.frame.DataFrame.round': [
256249
'df.round(decimals)',
@@ -274,20 +267,13 @@ def test_dataframe_tests(self):
274267
'pandas.core.frame.DataFrame.set_index': [
275268
"df.set_index([s, s**2])",
276269
],
277-
278-
# TODO(BEAM-12495)
279-
'pandas.core.frame.DataFrame.value_counts': [
280-
'df.value_counts(dropna=False)'
281-
],
282270
},
283271
skip={
284272
# s2 created with reindex
285273
'pandas.core.frame.DataFrame.dot': [
286274
'df.dot(s2)',
287275
],
288276

289-
'pandas.core.frame.DataFrame.resample': ['df'],
290-
'pandas.core.frame.DataFrame.asfreq': ['*'],
291277
# Throws NotImplementedError when modifying df
292278
'pandas.core.frame.DataFrame.axes': [
293279
# Returns deferred index.
@@ -316,14 +302,6 @@ def test_dataframe_tests(self):
316302
'pandas.core.frame.DataFrame.to_markdown': ['*'],
317303
'pandas.core.frame.DataFrame.to_parquet': ['*'],
318304

319-
# Raises right exception, but testing framework has matching issues.
320-
# Tested in `frames_test.py`.
321-
'pandas.core.frame.DataFrame.insert': [
322-
'df',
323-
'df.insert(1, "newcol", [99, 99])',
324-
'df.insert(0, "col1", [100, 100], allow_duplicates=True)'
325-
],
326-
327305
'pandas.core.frame.DataFrame.to_records': [
328306
'df.index = df.index.rename("I")',
329307
'index_dtypes = f"<S{df.index.str.len().max()}"', # 1.x
@@ -407,8 +385,7 @@ def test_series_tests(self):
407385
's.dot(arr)', # non-deferred result
408386
],
409387
'pandas.core.series.Series.fillna': [
410-
'df.fillna(method=\'ffill\')',
411-
'df.fillna(method="ffill")',
388+
"df.fillna(method='ffill')",
412389
'df.fillna(value=values, limit=1)',
413390
],
414391
'pandas.core.series.Series.items': ['*'],
@@ -457,11 +434,11 @@ def test_series_tests(self):
457434
's.drop_duplicates()',
458435
"s.drop_duplicates(keep='last')",
459436
],
437+
'pandas.core.series.Series.repeat': [
438+
's.repeat([1, 2, 3])'
439+
],
460440
'pandas.core.series.Series.reindex': ['*'],
461441
'pandas.core.series.Series.autocorr': ['*'],
462-
'pandas.core.series.Series.repeat': ['s.repeat([1, 2, 3])'],
463-
'pandas.core.series.Series.resample': ['*'],
464-
'pandas.core.series.Series': ['ser.iloc[0] = 999'],
465442
},
466443
not_implemented_ok={
467444
'pandas.core.series.Series.transform': [
@@ -474,19 +451,15 @@ def test_series_tests(self):
474451
'ser.groupby(["a", "b", "a", np.nan]).mean()',
475452
'ser.groupby(["a", "b", "a", np.nan], dropna=False).mean()',
476453
],
477-
'pandas.core.series.Series.swaplevel' :['*']
478454
},
479455
skip={
480-
# Relies on setting values with iloc
481-
'pandas.core.series.Series': ['ser', 'r'],
482456
'pandas.core.series.Series.groupby': [
483457
# TODO(BEAM-11393): This example requires aligning two series
484458
# with non-unique indexes. It only works in pandas because
485459
# pandas can recognize the indexes are identical and elide the
486460
# alignment.
487461
'ser.groupby(ser > 100).mean()',
488462
],
489-
'pandas.core.series.Series.asfreq': ['*'],
490463
# error formatting
491464
'pandas.core.series.Series.append': [
492465
's1.append(s2, verify_integrity=True)',
@@ -518,12 +491,12 @@ def test_series_tests(self):
518491
# Inspection after modification.
519492
's'
520493
],
521-
'pandas.core.series.Series.resample': ['df'],
522494
})
523495
self.assertEqual(result.failed, 0)
524496

525497
def test_string_tests(self):
526-
if PD_VERSION < (1, 2):
498+
PD_VERSION = tuple(int(v) for v in pd.__version__.split('.'))
499+
if PD_VERSION < (1, 2, 0):
527500
module = pd.core.strings
528501
else:
529502
# Definitions were moved to accessor in pandas 1.2.0
@@ -695,13 +668,11 @@ def test_groupby_tests(self):
695668
'pandas.core.groupby.generic.SeriesGroupBy.diff': ['*'],
696669
'pandas.core.groupby.generic.DataFrameGroupBy.hist': ['*'],
697670
'pandas.core.groupby.generic.DataFrameGroupBy.fillna': [
698-
'df.fillna(method=\'ffill\')',
699-
'df.fillna(method="ffill")',
671+
"df.fillna(method='ffill')",
700672
'df.fillna(value=values, limit=1)',
701673
],
702674
'pandas.core.groupby.generic.SeriesGroupBy.fillna': [
703-
'df.fillna(method=\'ffill\')',
704-
'df.fillna(method="ffill")',
675+
"df.fillna(method='ffill')",
705676
'df.fillna(value=values, limit=1)',
706677
],
707678
},
@@ -711,7 +682,6 @@ def test_groupby_tests(self):
711682
'pandas.core.groupby.generic.SeriesGroupBy.transform': ['*'],
712683
'pandas.core.groupby.generic.SeriesGroupBy.idxmax': ['*'],
713684
'pandas.core.groupby.generic.SeriesGroupBy.idxmin': ['*'],
714-
'pandas.core.groupby.generic.SeriesGroupBy.apply': ['*'],
715685
},
716686
skip={
717687
'pandas.core.groupby.generic.SeriesGroupBy.cov': [
@@ -728,14 +698,6 @@ def test_groupby_tests(self):
728698
# These examples rely on grouping by a list
729699
'pandas.core.groupby.generic.SeriesGroupBy.aggregate': ['*'],
730700
'pandas.core.groupby.generic.DataFrameGroupBy.aggregate': ['*'],
731-
'pandas.core.groupby.generic.SeriesGroupBy.transform': [
732-
# Dropping invalid columns during a transform is unsupported.
733-
'grouped.transform(lambda x: (x - x.mean()) / x.std())'
734-
],
735-
'pandas.core.groupby.generic.DataFrameGroupBy.transform': [
736-
# Dropping invalid columns during a transform is unsupported.
737-
'grouped.transform(lambda x: (x - x.mean()) / x.std())'
738-
],
739701
})
740702
self.assertEqual(result.failed, 0)
741703

sdks/python/setup.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -166,7 +166,7 @@ def get_version():
166166
REQUIRED_TEST_PACKAGES = [
167167
'freezegun>=0.3.12',
168168
'mock>=1.0.1,<3.0.0',
169-
'pandas>=1.0,<1.4.0',
169+
'pandas>=1.0,<1.3.0',
170170
'parameterized>=0.7.1,<0.8.0',
171171
'pyhamcrest>=1.9,!=1.10.0,<2.0.0',
172172
'pyyaml>=3.12,<6.0.0',

0 commit comments

Comments
 (0)