From 4ca9dee5066fb307eded3e54e0bcca5fd029376e Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Fri, 13 Jul 2018 21:46:11 +0200 Subject: [PATCH 1/7] TST/CLN: series.duplicated; parametrisation; fix warning --- pandas/tests/series/test_analytics.py | 125 +++++++++----------------- 1 file changed, 44 insertions(+), 81 deletions(-) diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index fd14118bd833f..4ef30d5faf86b 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -956,94 +956,56 @@ def test_unique(self): check_dtype=False) @pytest.mark.parametrize( - "tc1, tc2", + 'keep, expected', [ - ( - Series([1, 2, 3, 3], dtype=np.dtype('int_')), - Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('int_')) - ), - ( - Series([1, 2, 3, 3], dtype=np.dtype('uint')), - Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('uint')) - ), - ( - Series([1, 2, 3, 3], dtype=np.dtype('float_')), - Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('float_')) - ), - ( - Series([1, 2, 3, 3], dtype=np.dtype('unicode_')), - Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype('unicode_')) - ) - ] - ) - def test_drop_duplicates_non_bool(self, tc1, tc2): - # Test case 1 - expected = Series([False, False, False, True]) - assert_series_equal(tc1.duplicated(), expected) - assert_series_equal(tc1.drop_duplicates(), tc1[~expected]) - sc = tc1.copy() - sc.drop_duplicates(inplace=True) - assert_series_equal(sc, tc1[~expected]) - - expected = Series([False, False, True, False]) - assert_series_equal(tc1.duplicated(keep='last'), expected) - assert_series_equal(tc1.drop_duplicates(keep='last'), tc1[~expected]) - sc = tc1.copy() - sc.drop_duplicates(keep='last', inplace=True) - assert_series_equal(sc, tc1[~expected]) - - expected = Series([False, False, True, True]) - assert_series_equal(tc1.duplicated(keep=False), expected) - assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected]) - sc = tc1.copy() - sc.drop_duplicates(keep=False, inplace=True) - assert_series_equal(sc, tc1[~expected]) - - # Test case 2 - expected = Series([False, False, False, False, True, True, False]) - assert_series_equal(tc2.duplicated(), expected) - assert_series_equal(tc2.drop_duplicates(), tc2[~expected]) - sc = tc2.copy() - sc.drop_duplicates(inplace=True) - assert_series_equal(sc, tc2[~expected]) - - expected = Series([False, True, True, False, False, False, False]) - assert_series_equal(tc2.duplicated(keep='last'), expected) - assert_series_equal(tc2.drop_duplicates(keep='last'), tc2[~expected]) - sc = tc2.copy() - sc.drop_duplicates(keep='last', inplace=True) - assert_series_equal(sc, tc2[~expected]) - - expected = Series([False, True, True, False, True, True, False]) - assert_series_equal(tc2.duplicated(keep=False), expected) - assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected]) - sc = tc2.copy() - sc.drop_duplicates(keep=False, inplace=True) - assert_series_equal(sc, tc2[~expected]) + ('first', Series([False, False, False, False, True, True, False])), + ('last', Series([False, True, True, False, False, False, False])), + (False, Series([False, True, True, False, True, True, False])) + ]) + @pytest.mark.parametrize('npdtype', ['int_', 'uint', 'float_', 'unicode_']) + def test_drop_duplicates_non_bool(self, npdtype, keep, expected): + tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(npdtype)) + + assert_series_equal(tc.duplicated(keep=keep), expected) + assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=keep, inplace=True) + assert_series_equal(sc, tc[~expected]) - def test_drop_duplicates_bool(self): + @pytest.mark.parametrize('keep, expected', + [('first', Series([False, False, True, True])), + ('last', Series([True, True, False, False])), + (False, Series([True, True, True, True]))]) + def test_drop_duplicates_bool(self, keep, expected): tc = Series([True, False, True, False]) - expected = Series([False, False, True, True]) - assert_series_equal(tc.duplicated(), expected) - assert_series_equal(tc.drop_duplicates(), tc[~expected]) + assert_series_equal(tc.duplicated(keep=keep), expected) + assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() - sc.drop_duplicates(inplace=True) + sc.drop_duplicates(keep=keep, inplace=True) assert_series_equal(sc, tc[~expected]) - expected = Series([True, True, False, False]) - assert_series_equal(tc.duplicated(keep='last'), expected) - assert_series_equal(tc.drop_duplicates(keep='last'), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep='last', inplace=True) - assert_series_equal(sc, tc[~expected]) + @pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True], name='name')), + ('last', Series([True, True, False, False, False], name='name')), + (False, Series([True, True, True, False, True], name='name')) + ]) + def test_duplicated_keep(self, keep, expected): + s = Series(['a', 'b', 'b', 'c', 'a'], name='name') - expected = Series([True, True, True, True]) - assert_series_equal(tc.duplicated(keep=False), expected) - assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=False, inplace=True) - assert_series_equal(sc, tc[~expected]) + result = s.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True])), + ('last', Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])) + ]) + def test_duplicated_nan_none(self, keep, expected): + s = Series([np.nan, 3, 3, None, np.nan], dtype=object) + + result = s.duplicated(keep=keep) + tm.assert_series_equal(result, expected) def test_clip(self): val = self.ts.median() @@ -1416,7 +1378,8 @@ def test_ptp(self): N = 1000 arr = np.random.randn(N) ser = Series(arr) - assert np.ptp(ser) == np.ptp(arr) + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + assert np.ptp(ser) == np.ptp(arr) # GH11163 s = Series([3, 5, np.nan, -3, 10]) From c06442b5c86e7b62c5b6862db70e43239ff29572 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 14 Jul 2018 17:16:31 +0200 Subject: [PATCH 2/7] Incorporate review (jreback) --- pandas/conftest.py | 33 ++++++++++++++++++++++++++- pandas/tests/series/test_analytics.py | 5 ++-- 2 files changed, 34 insertions(+), 4 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index c1376670ffbf0..49c49e84031ee 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -289,7 +289,6 @@ def complex_dtype(request): UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"] ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES -ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES @pytest.fixture(params=SIGNED_INT_DTYPES) @@ -338,6 +337,13 @@ def any_int_dtype(request): return request.param +FLOAT_DTYPES = [float, "float32", "float64"] +COMPLEX_DTYPES = [complex, "complex64", "complex128"] +STRING_DTYPES = [str, 'str', 'U'] +ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES +ALL_NUMPY_DTYPES = ALL_REAL_DTYPES + STRING_DTYPES + COMPLEX_DTYPES + + @pytest.fixture(params=ALL_REAL_DTYPES) def any_real_dtype(request): """ @@ -358,6 +364,31 @@ def any_real_dtype(request): return request.param +@pytest.fixture(params=ALL_NUMPY_DTYPES) +def any_numpy_dtype(request): + """ + Parameterized fixture for any integer dtypes. + + * int8 + * uint8 + * int16 + * uint16 + * int32 + * uint32 + * int64 + * uint64 + * float32 + * float64 + * complex64 + * complex128 + * str + * 'str' + * 'U' + """ + + return request.param + + @pytest.fixture def mock(): """ diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 4ef30d5faf86b..7d9bff7bb48f5 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -962,9 +962,8 @@ def test_unique(self): ('last', Series([False, True, True, False, False, False, False])), (False, Series([False, True, True, False, True, True, False])) ]) - @pytest.mark.parametrize('npdtype', ['int_', 'uint', 'float_', 'unicode_']) - def test_drop_duplicates_non_bool(self, npdtype, keep, expected): - tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(npdtype)) + def test_drop_duplicates_non_bool(self, any_numpy_dtype, keep, expected): + tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype)) assert_series_equal(tc.duplicated(keep=keep), expected) assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) From 92d708cbcfa8644d53fe108e40c2eb211cbfba17 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 14 Jul 2018 17:47:52 +0200 Subject: [PATCH 3/7] Split off duplicates/unique tests into separate file --- pandas/tests/series/test_analytics.py | 126 ---------------------- pandas/tests/series/test_duplicates.py | 142 +++++++++++++++++++++++++ 2 files changed, 142 insertions(+), 126 deletions(-) create mode 100644 pandas/tests/series/test_duplicates.py diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py index 7d9bff7bb48f5..28a77bbb1d3fa 100644 --- a/pandas/tests/series/test_analytics.py +++ b/pandas/tests/series/test_analytics.py @@ -907,105 +907,6 @@ def test_matmul(self): pytest.raises(Exception, a.dot, a.values[:3]) pytest.raises(ValueError, a.dot, b.T) - def test_value_counts_nunique(self): - - # basics.rst doc example - series = Series(np.random.randn(500)) - series[20:500] = np.nan - series[10:20] = 5000 - result = series.nunique() - assert result == 11 - - # GH 18051 - s = pd.Series(pd.Categorical([])) - assert s.nunique() == 0 - s = pd.Series(pd.Categorical([np.nan])) - assert s.nunique() == 0 - - def test_unique(self): - - # 714 also, dtype=float - s = Series([1.2345] * 100) - s[::2] = np.nan - result = s.unique() - assert len(result) == 2 - - s = Series([1.2345] * 100, dtype='f4') - s[::2] = np.nan - result = s.unique() - assert len(result) == 2 - - # NAs in object arrays #714 - s = Series(['foo'] * 100, dtype='O') - s[::2] = np.nan - result = s.unique() - assert len(result) == 2 - - # decision about None - s = Series([1, 2, 3, None, None, None], dtype=object) - result = s.unique() - expected = np.array([1, 2, 3, None], dtype=object) - tm.assert_numpy_array_equal(result, expected) - - # GH 18051 - s = pd.Series(pd.Categorical([])) - tm.assert_categorical_equal(s.unique(), pd.Categorical([]), - check_dtype=False) - s = pd.Series(pd.Categorical([np.nan])) - tm.assert_categorical_equal(s.unique(), pd.Categorical([np.nan]), - check_dtype=False) - - @pytest.mark.parametrize( - 'keep, expected', - [ - ('first', Series([False, False, False, False, True, True, False])), - ('last', Series([False, True, True, False, False, False, False])), - (False, Series([False, True, True, False, True, True, False])) - ]) - def test_drop_duplicates_non_bool(self, any_numpy_dtype, keep, expected): - tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype)) - - assert_series_equal(tc.duplicated(keep=keep), expected) - assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=keep, inplace=True) - assert_series_equal(sc, tc[~expected]) - - @pytest.mark.parametrize('keep, expected', - [('first', Series([False, False, True, True])), - ('last', Series([True, True, False, False])), - (False, Series([True, True, True, True]))]) - def test_drop_duplicates_bool(self, keep, expected): - tc = Series([True, False, True, False]) - - assert_series_equal(tc.duplicated(keep=keep), expected) - assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=keep, inplace=True) - assert_series_equal(sc, tc[~expected]) - - @pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True], name='name')), - ('last', Series([True, True, False, False, False], name='name')), - (False, Series([True, True, True, False, True], name='name')) - ]) - def test_duplicated_keep(self, keep, expected): - s = Series(['a', 'b', 'b', 'c', 'a'], name='name') - - result = s.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - - @pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True])), - ('last', Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])) - ]) - def test_duplicated_nan_none(self, keep, expected): - s = Series([np.nan, 3, 3, None, np.nan], dtype=object) - - result = s.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - def test_clip(self): val = self.ts.median() @@ -1419,10 +1320,6 @@ def test_empty_timeseries_redections_return_nat(self): assert Series([], dtype=dtype).min() is pd.NaT assert Series([], dtype=dtype).max() is pd.NaT - def test_unique_data_ownership(self): - # it works! #1807 - Series(Series(["a", "c", "b"]).unique()).sort_values() - def test_repeat(self): s = Series(np.random.randn(3), index=['a', 'b', 'c']) @@ -1499,29 +1396,6 @@ def test_searchsorted_sorter(self): e = np.array([0, 2], dtype=np.intp) tm.assert_numpy_array_equal(r, e) - def test_is_unique(self): - # GH11946 - s = Series(np.random.randint(0, 10, size=1000)) - assert not s.is_unique - s = Series(np.arange(1000)) - assert s.is_unique - - def test_is_unique_class_ne(self, capsys): - # GH 20661 - class Foo(object): - def __init__(self, val): - self._value = val - - def __ne__(self, other): - raise Exception("NEQ not supported") - - li = [Foo(i) for i in range(5)] - s = pd.Series(li, index=[i for i in range(5)]) - _, err = capsys.readouterr() - s.is_unique - _, err = capsys.readouterr() - assert len(err) == 0 - def test_is_monotonic(self): s = Series(np.random.randint(0, 10, size=1000)) diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py new file mode 100644 index 0000000000000..90cf4831c8413 --- /dev/null +++ b/pandas/tests/series/test_duplicates.py @@ -0,0 +1,142 @@ +# coding=utf-8 +# pylint: disable-msg=E1101,W0612 + +import pytest + +import numpy as np +import pandas as pd + +from pandas import Series + +from pandas.util.testing import assert_series_equal +import pandas.util.testing as tm +from .common import TestData + + +class TestSeriesDuplicates(TestData): + + def test_value_counts_nunique(self): + + # basics.rst doc example + series = Series(np.random.randn(500)) + series[20:500] = np.nan + series[10:20] = 5000 + result = series.nunique() + assert result == 11 + + # GH 18051 + s = pd.Series(pd.Categorical([])) + assert s.nunique() == 0 + s = pd.Series(pd.Categorical([np.nan])) + assert s.nunique() == 0 + + def test_unique(self): + + # 714 also, dtype=float + s = Series([1.2345] * 100) + s[::2] = np.nan + result = s.unique() + assert len(result) == 2 + + s = Series([1.2345] * 100, dtype='f4') + s[::2] = np.nan + result = s.unique() + assert len(result) == 2 + + # NAs in object arrays #714 + s = Series(['foo'] * 100, dtype='O') + s[::2] = np.nan + result = s.unique() + assert len(result) == 2 + + # decision about None + s = Series([1, 2, 3, None, None, None], dtype=object) + result = s.unique() + expected = np.array([1, 2, 3, None], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + # GH 18051 + s = pd.Series(pd.Categorical([])) + tm.assert_categorical_equal(s.unique(), pd.Categorical([]), + check_dtype=False) + s = pd.Series(pd.Categorical([np.nan])) + tm.assert_categorical_equal(s.unique(), pd.Categorical([np.nan]), + check_dtype=False) + + def test_unique_data_ownership(self): + # it works! #1807 + Series(Series(["a", "c", "b"]).unique()).sort_values() + + def test_is_unique(self): + # GH11946 + s = Series(np.random.randint(0, 10, size=1000)) + assert not s.is_unique + s = Series(np.arange(1000)) + assert s.is_unique + + def test_is_unique_class_ne(self, capsys): + # GH 20661 + class Foo(object): + def __init__(self, val): + self._value = val + + def __ne__(self, other): + raise Exception("NEQ not supported") + + li = [Foo(i) for i in range(5)] + s = pd.Series(li, index=[i for i in range(5)]) + _, err = capsys.readouterr() + s.is_unique + _, err = capsys.readouterr() + assert len(err) == 0 + + @pytest.mark.parametrize( + 'keep, expected', + [ + ('first', Series([False, False, False, False, True, True, False])), + ('last', Series([False, True, True, False, False, False, False])), + (False, Series([False, True, True, False, True, True, False])) + ]) + def test_drop_duplicates_non_bool(self, any_numpy_dtype, keep, expected): + tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype)) + + assert_series_equal(tc.duplicated(keep=keep), expected) + assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=keep, inplace=True) + assert_series_equal(sc, tc[~expected]) + + @pytest.mark.parametrize('keep, expected', + [('first', Series([False, False, True, True])), + ('last', Series([True, True, False, False])), + (False, Series([True, True, True, True]))]) + def test_drop_duplicates_bool(self, keep, expected): + tc = Series([True, False, True, False]) + + assert_series_equal(tc.duplicated(keep=keep), expected) + assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=keep, inplace=True) + assert_series_equal(sc, tc[~expected]) + + @pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True], name='name')), + ('last', Series([True, True, False, False, False], name='name')), + (False, Series([True, True, True, False, True], name='name')) + ]) + def test_duplicated_keep(self, keep, expected): + s = Series(['a', 'b', 'b', 'c', 'a'], name='name') + + result = s.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True])), + ('last', Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])) + ]) + def test_duplicated_nan_none(self, keep, expected): + s = Series([np.nan, 3, 3, None, np.nan], dtype=object) + + result = s.duplicated(keep=keep) + tm.assert_series_equal(result, expected) From 12d288835e8660f8a2981f59ada78baea16c6bc1 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 14 Jul 2018 22:03:52 +0200 Subject: [PATCH 4/7] Change tests away from classes; reorder conftest.py --- pandas/conftest.py | 33 ++-- pandas/tests/series/test_duplicates.py | 257 +++++++++++++------------ 2 files changed, 145 insertions(+), 145 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 49c49e84031ee..133a888830b67 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -248,7 +248,19 @@ def tz_aware_fixture(request): return request.param -@pytest.fixture(params=[str, 'str', 'U']) +UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] +SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"] +ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES + +FLOAT_DTYPES = [float, "float32", "float64"] +COMPLEX_DTYPES = [complex, "complex64", "complex128"] +STRING_DTYPES = [str, 'str', 'U'] + +ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES +ALL_NUMPY_DTYPES = ALL_REAL_DTYPES + STRING_DTYPES + COMPLEX_DTYPES + + +@pytest.fixture(params=STRING_DTYPES) def string_dtype(request): """Parametrized fixture for string dtypes. @@ -259,9 +271,6 @@ def string_dtype(request): return request.param -FLOAT_DTYPES = [float, "float32", "float64"] - - @pytest.fixture(params=FLOAT_DTYPES) def float_dtype(request): """ @@ -274,7 +283,7 @@ def float_dtype(request): return request.param -@pytest.fixture(params=[complex, "complex64", "complex128"]) +@pytest.fixture(params=COMPLEX_DTYPES) def complex_dtype(request): """ Parameterized fixture for complex dtypes. @@ -286,11 +295,6 @@ def complex_dtype(request): return request.param -UNSIGNED_INT_DTYPES = ["uint8", "uint16", "uint32", "uint64"] -SIGNED_INT_DTYPES = [int, "int8", "int16", "int32", "int64"] -ALL_INT_DTYPES = UNSIGNED_INT_DTYPES + SIGNED_INT_DTYPES - - @pytest.fixture(params=SIGNED_INT_DTYPES) def sint_dtype(request): """ @@ -337,13 +341,6 @@ def any_int_dtype(request): return request.param -FLOAT_DTYPES = [float, "float32", "float64"] -COMPLEX_DTYPES = [complex, "complex64", "complex128"] -STRING_DTYPES = [str, 'str', 'U'] -ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES -ALL_NUMPY_DTYPES = ALL_REAL_DTYPES + STRING_DTYPES + COMPLEX_DTYPES - - @pytest.fixture(params=ALL_REAL_DTYPES) def any_real_dtype(request): """ @@ -367,7 +364,7 @@ def any_real_dtype(request): @pytest.fixture(params=ALL_NUMPY_DTYPES) def any_numpy_dtype(request): """ - Parameterized fixture for any integer dtypes. + Parameterized fixture for any numpy dtypes. * int8 * uint8 diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 90cf4831c8413..ed6255084b398 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -10,133 +10,136 @@ from pandas.util.testing import assert_series_equal import pandas.util.testing as tm -from .common import TestData - - -class TestSeriesDuplicates(TestData): - - def test_value_counts_nunique(self): - - # basics.rst doc example - series = Series(np.random.randn(500)) - series[20:500] = np.nan - series[10:20] = 5000 - result = series.nunique() - assert result == 11 - - # GH 18051 - s = pd.Series(pd.Categorical([])) - assert s.nunique() == 0 - s = pd.Series(pd.Categorical([np.nan])) - assert s.nunique() == 0 - - def test_unique(self): - - # 714 also, dtype=float - s = Series([1.2345] * 100) - s[::2] = np.nan - result = s.unique() - assert len(result) == 2 - - s = Series([1.2345] * 100, dtype='f4') - s[::2] = np.nan - result = s.unique() - assert len(result) == 2 - - # NAs in object arrays #714 - s = Series(['foo'] * 100, dtype='O') - s[::2] = np.nan - result = s.unique() - assert len(result) == 2 - - # decision about None - s = Series([1, 2, 3, None, None, None], dtype=object) - result = s.unique() - expected = np.array([1, 2, 3, None], dtype=object) - tm.assert_numpy_array_equal(result, expected) - - # GH 18051 - s = pd.Series(pd.Categorical([])) - tm.assert_categorical_equal(s.unique(), pd.Categorical([]), - check_dtype=False) - s = pd.Series(pd.Categorical([np.nan])) - tm.assert_categorical_equal(s.unique(), pd.Categorical([np.nan]), - check_dtype=False) - - def test_unique_data_ownership(self): - # it works! #1807 - Series(Series(["a", "c", "b"]).unique()).sort_values() - - def test_is_unique(self): - # GH11946 - s = Series(np.random.randint(0, 10, size=1000)) - assert not s.is_unique - s = Series(np.arange(1000)) - assert s.is_unique - - def test_is_unique_class_ne(self, capsys): - # GH 20661 - class Foo(object): - def __init__(self, val): - self._value = val - - def __ne__(self, other): - raise Exception("NEQ not supported") - - li = [Foo(i) for i in range(5)] - s = pd.Series(li, index=[i for i in range(5)]) - _, err = capsys.readouterr() - s.is_unique - _, err = capsys.readouterr() - assert len(err) == 0 - - @pytest.mark.parametrize( - 'keep, expected', - [ - ('first', Series([False, False, False, False, True, True, False])), - ('last', Series([False, True, True, False, False, False, False])), - (False, Series([False, True, True, False, True, True, False])) - ]) - def test_drop_duplicates_non_bool(self, any_numpy_dtype, keep, expected): - tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype)) - - assert_series_equal(tc.duplicated(keep=keep), expected) - assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=keep, inplace=True) - assert_series_equal(sc, tc[~expected]) - - @pytest.mark.parametrize('keep, expected', - [('first', Series([False, False, True, True])), - ('last', Series([True, True, False, False])), - (False, Series([True, True, True, True]))]) - def test_drop_duplicates_bool(self, keep, expected): - tc = Series([True, False, True, False]) - - assert_series_equal(tc.duplicated(keep=keep), expected) - assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) - sc = tc.copy() - sc.drop_duplicates(keep=keep, inplace=True) - assert_series_equal(sc, tc[~expected]) - - @pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True], name='name')), - ('last', Series([True, True, False, False, False], name='name')), - (False, Series([True, True, True, False, True], name='name')) - ]) - def test_duplicated_keep(self, keep, expected): - s = Series(['a', 'b', 'b', 'c', 'a'], name='name') - result = s.duplicated(keep=keep) - tm.assert_series_equal(result, expected) - @pytest.mark.parametrize('keep, expected', [ - ('first', Series([False, False, True, False, True])), - ('last', Series([True, True, False, False, False])), - (False, Series([True, True, True, False, True])) +def test_value_counts_nunique(): + # basics.rst doc example + series = Series(np.random.randn(500)) + series[20:500] = np.nan + series[10:20] = 5000 + result = series.nunique() + assert result == 11 + + # GH 18051 + s = pd.Series(pd.Categorical([])) + assert s.nunique() == 0 + s = pd.Series(pd.Categorical([np.nan])) + assert s.nunique() == 0 + + +def test_unique(): + # GH714 also, dtype=float + s = Series([1.2345] * 100) + s[::2] = np.nan + result = s.unique() + assert len(result) == 2 + + s = Series([1.2345] * 100, dtype='f4') + s[::2] = np.nan + result = s.unique() + assert len(result) == 2 + + # NAs in object arrays #714 + s = Series(['foo'] * 100, dtype='O') + s[::2] = np.nan + result = s.unique() + assert len(result) == 2 + + # decision about None + s = Series([1, 2, 3, None, None, None], dtype=object) + result = s.unique() + expected = np.array([1, 2, 3, None], dtype=object) + tm.assert_numpy_array_equal(result, expected) + + # GH 18051 + s = pd.Series(pd.Categorical([])) + tm.assert_categorical_equal(s.unique(), pd.Categorical([]), + check_dtype=False) + s = pd.Series(pd.Categorical([np.nan])) + tm.assert_categorical_equal(s.unique(), pd.Categorical([np.nan]), + check_dtype=False) + + +def test_unique_data_ownership(): + # it works! #1807 + Series(Series(["a", "c", "b"]).unique()).sort_values() + + +def test_is_unique(): + # GH11946 + s = Series(np.random.randint(0, 10, size=1000)) + assert not s.is_unique + s = Series(np.arange(1000)) + assert s.is_unique + + +def test_is_unique_class_ne(capsys): + # GH 20661 + class Foo(object): + def __init__(self, val): + self._value = val + + def __ne__(self, other): + raise Exception("NEQ not supported") + + li = [Foo(i) for i in range(5)] + s = pd.Series(li, index=[i for i in range(5)]) + _, err = capsys.readouterr() + s.is_unique + _, err = capsys.readouterr() + assert len(err) == 0 + + +@pytest.mark.parametrize( + 'keep, expected', + [ + ('first', Series([False, False, False, False, True, True, False])), + ('last', Series([False, True, True, False, False, False, False])), + (False, Series([False, True, True, False, True, True, False])) ]) - def test_duplicated_nan_none(self, keep, expected): - s = Series([np.nan, 3, 3, None, np.nan], dtype=object) - - result = s.duplicated(keep=keep) - tm.assert_series_equal(result, expected) +def test_drop_duplicates_non_bool(any_numpy_dtype, keep, expected): + tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype)) + + assert_series_equal(tc.duplicated(keep=keep), expected) + assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=keep, inplace=True) + assert_series_equal(sc, tc[~expected]) + + +@pytest.mark.parametrize('keep, expected', + [('first', Series([False, False, True, True])), + ('last', Series([True, True, False, False])), + (False, Series([True, True, True, True]))]) +def test_drop_duplicates_bool(keep, expected): + tc = Series([True, False, True, False]) + + assert_series_equal(tc.duplicated(keep=keep), expected) + assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + sc = tc.copy() + sc.drop_duplicates(keep=keep, inplace=True) + assert_series_equal(sc, tc[~expected]) + + +@pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True], name='name')), + ('last', Series([True, True, False, False, False], name='name')), + (False, Series([True, True, True, False, True], name='name')) +]) +def test_duplicated_keep(keep, expected): + s = Series(['a', 'b', 'b', 'c', 'a'], name='name') + + result = s.duplicated(keep=keep) + tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize('keep, expected', [ + ('first', Series([False, False, True, False, True])), + ('last', Series([True, True, False, False, False])), + (False, Series([True, True, True, False, True])) +]) +def test_duplicated_nan_none(keep, expected): + s = Series([np.nan, 3, 3, None, np.nan], dtype=object) + + result = s.duplicated(keep=keep) + tm.assert_series_equal(result, expected) From c9a3f71fd5fcbeecd0a26cf878985dbcf647f76d Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 14 Jul 2018 22:19:47 +0200 Subject: [PATCH 5/7] Typos --- pandas/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 133a888830b67..a979c3fc3bfac 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -257,7 +257,7 @@ def tz_aware_fixture(request): STRING_DTYPES = [str, 'str', 'U'] ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES -ALL_NUMPY_DTYPES = ALL_REAL_DTYPES + STRING_DTYPES + COMPLEX_DTYPES +ALL_NUMPY_DTYPES = ALL_REAL_DTYPES + COMPLEX_DTYPES + STRING_DTYPES @pytest.fixture(params=STRING_DTYPES) @@ -364,7 +364,7 @@ def any_real_dtype(request): @pytest.fixture(params=ALL_NUMPY_DTYPES) def any_numpy_dtype(request): """ - Parameterized fixture for any numpy dtypes. + Parameterized fixture for all numpy dtypes. * int8 * uint8 From d34b7d17ac2b81827c8a28d00291be3725b89ef0 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 14 Jul 2018 22:29:54 +0200 Subject: [PATCH 6/7] Further cleanup --- pandas/tests/series/test_duplicates.py | 32 +++++++++++--------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index ed6255084b398..2135800a4ae4d 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -1,14 +1,11 @@ # coding=utf-8 -# pylint: disable-msg=E1101,W0612 import pytest import numpy as np -import pandas as pd -from pandas import Series +from pandas import Series, Categorical -from pandas.util.testing import assert_series_equal import pandas.util.testing as tm @@ -21,9 +18,9 @@ def test_value_counts_nunique(): assert result == 11 # GH 18051 - s = pd.Series(pd.Categorical([])) + s = Series(Categorical([])) assert s.nunique() == 0 - s = pd.Series(pd.Categorical([np.nan])) + s = Series(Categorical([np.nan])) assert s.nunique() == 0 @@ -52,11 +49,10 @@ def test_unique(): tm.assert_numpy_array_equal(result, expected) # GH 18051 - s = pd.Series(pd.Categorical([])) - tm.assert_categorical_equal(s.unique(), pd.Categorical([]), - check_dtype=False) - s = pd.Series(pd.Categorical([np.nan])) - tm.assert_categorical_equal(s.unique(), pd.Categorical([np.nan]), + s = Series(Categorical([])) + tm.assert_categorical_equal(s.unique(), Categorical([]), check_dtype=False) + s = Series(Categorical([np.nan])) + tm.assert_categorical_equal(s.unique(), Categorical([np.nan]), check_dtype=False) @@ -83,7 +79,7 @@ def __ne__(self, other): raise Exception("NEQ not supported") li = [Foo(i) for i in range(5)] - s = pd.Series(li, index=[i for i in range(5)]) + s = Series(li, index=[i for i in range(5)]) _, err = capsys.readouterr() s.is_unique _, err = capsys.readouterr() @@ -100,11 +96,11 @@ def __ne__(self, other): def test_drop_duplicates_non_bool(any_numpy_dtype, keep, expected): tc = Series([1, 2, 3, 5, 3, 2, 4], dtype=np.dtype(any_numpy_dtype)) - assert_series_equal(tc.duplicated(keep=keep), expected) - assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() sc.drop_duplicates(keep=keep, inplace=True) - assert_series_equal(sc, tc[~expected]) + tm.assert_series_equal(sc, tc[~expected]) @pytest.mark.parametrize('keep, expected', @@ -114,11 +110,11 @@ def test_drop_duplicates_non_bool(any_numpy_dtype, keep, expected): def test_drop_duplicates_bool(keep, expected): tc = Series([True, False, True, False]) - assert_series_equal(tc.duplicated(keep=keep), expected) - assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) + tm.assert_series_equal(tc.duplicated(keep=keep), expected) + tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected]) sc = tc.copy() sc.drop_duplicates(keep=keep, inplace=True) - assert_series_equal(sc, tc[~expected]) + tm.assert_series_equal(sc, tc[~expected]) @pytest.mark.parametrize('keep, expected', [ From e64004039d9c358684f455479ab7c7d75e904a11 Mon Sep 17 00:00:00 2001 From: "H. Vetinari" Date: Sat, 14 Jul 2018 22:45:25 +0200 Subject: [PATCH 7/7] Remove space --- pandas/tests/series/test_duplicates.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/series/test_duplicates.py b/pandas/tests/series/test_duplicates.py index 2135800a4ae4d..2e4d64188307c 100644 --- a/pandas/tests/series/test_duplicates.py +++ b/pandas/tests/series/test_duplicates.py @@ -5,7 +5,6 @@ import numpy as np from pandas import Series, Categorical - import pandas.util.testing as tm