Skip to content

Feature: draw percentiles in violinplot #14107

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
May 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
e1985d7
Fix issue 10788
sasoripathos Mar 22, 2019
2279b16
Fix style
sasoripathos Mar 22, 2019
19bb0c7
Add missing import statement
sasoripathos Mar 22, 2019
b7fdc9b
Merge remote-tracking branch 'upstream/master'
sasoripathos Apr 6, 2019
109f252
Merge remote-tracking branch 'upstream/master'
sasoripathos Apr 19, 2019
b2ca609
Merge remote-tracking branch 'upstream/master'
sasoripathos Apr 24, 2019
2c5a873
Merge remote-tracking branch 'upstream/master'
sasoripathos Apr 30, 2019
cb0a64c
Remove changes
sasoripathos Apr 30, 2019
67664d5
Code for feature 8532
sasoripathos Apr 30, 2019
ea5dd1c
Run boilerplate
sasoripathos Apr 30, 2019
5552d3d
Fix code style
sasoripathos Apr 30, 2019
3fd9152
Remove unused test images
sasoripathos Apr 30, 2019
80d3fd8
Update pyplot.py
sasoripathos Apr 30, 2019
f78fcac
Update pyplot.py
sasoripathos Apr 30, 2019
dbda148
Run boilerplate
sasoripathos Apr 30, 2019
fb32794
Remove extra image-compare tests, update documentation
sasoripathos May 1, 2019
b8ae7bd
Merge remote-tracking branch 'upstream/master'
sasoripathos May 1, 2019
0bb79ec
Merge branch 'master' into feature8532
sasoripathos May 1, 2019
3d46d7f
Merge remote-tracking branch 'upstream/master' into feature8532
sasoripathos May 7, 2019
a3912a9
Use np.quantile instead of percentile
sasoripathos May 7, 2019
6e50219
Merge remote-tracking branch 'upstream/master'
sasoripathos May 7, 2019
ee5cd08
Update test images for violinplot
sasoripathos May 7, 2019
999a424
Merge branch 'master' into feature8532
sasoripathos May 7, 2019
eafd766
Revert "Update test images for violinplot"
sasoripathos May 7, 2019
e48658f
Use np.percentile implement quantile
sasoripathos May 7, 2019
8cbafae
Merge remote-tracking branch 'upstream/master'
sasoripathos May 8, 2019
6f3e2a0
Merge branch 'master' into feature8532
sasoripathos May 8, 2019
7a4777b
Update api from percentiles to quantiles
sasoripathos May 9, 2019
5d80c50
Merge branch 'feature8532' of https://github.com/sasoripathos/matplot…
sasoripathos May 9, 2019
6afe73a
Update violinplot call in test cases
sasoripathos May 9, 2019
54f955a
Update documentations from percentiles to quantiles
sasoripathos May 9, 2019
a0c3073
Update docstring and backward campatibility
sasoripathos May 10, 2019
c586932
Remove unnecessary api change doc
sasoripathos May 11, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 27 additions & 4 deletions examples/statistics/violinplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
pos = [1, 2, 4, 5, 7, 8]
data = [np.random.normal(0, std, size=100) for std in pos]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(6, 6))
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(10, 6))

axes[0, 0].violinplot(data, pos, points=20, widths=0.3,
showmeans=True, showextrema=True, showmedians=True)
Expand All @@ -43,19 +43,42 @@
showextrema=True, showmedians=True, bw_method=0.5)
axes[0, 2].set_title('Custom violinplot 3', fontsize=fs)

axes[0, 3].violinplot(data, pos, points=60, widths=0.7, showmeans=True,
showextrema=True, showmedians=True, bw_method=0.5,
quantiles=[[0.1], [], [], [0.175, 0.954], [0.75],
[0.25]])
axes[0, 3].set_title('Custom violinplot 4', fontsize=fs)

axes[0, 4].violinplot(data[-1:], pos[-1:], points=60, widths=0.7,
showmeans=True, showextrema=True, showmedians=True,
quantiles=[0.05, 0.1, 0.8, 0.9], bw_method=0.5)
axes[0, 4].set_title('Custom violinplot 5', fontsize=fs)

axes[1, 0].violinplot(data, pos, points=80, vert=False, widths=0.7,
showmeans=True, showextrema=True, showmedians=True)
axes[1, 0].set_title('Custom violinplot 4', fontsize=fs)
axes[1, 0].set_title('Custom violinplot 6', fontsize=fs)

axes[1, 1].violinplot(data, pos, points=100, vert=False, widths=0.9,
showmeans=True, showextrema=True, showmedians=True,
bw_method='silverman')
axes[1, 1].set_title('Custom violinplot 5', fontsize=fs)
axes[1, 1].set_title('Custom violinplot 7', fontsize=fs)

axes[1, 2].violinplot(data, pos, points=200, vert=False, widths=1.1,
showmeans=True, showextrema=True, showmedians=True,
bw_method=0.5)
axes[1, 2].set_title('Custom violinplot 6', fontsize=fs)
axes[1, 2].set_title('Custom violinplot 8', fontsize=fs)

axes[1, 3].violinplot(data, pos, points=200, vert=False, widths=1.1,
showmeans=True, showextrema=True, showmedians=True,
quantiles=[[0.1], [], [], [0.175, 0.954], [0.75],
[0.25]],
bw_method=0.5)
axes[1, 3].set_title('Custom violinplot 9', fontsize=fs)

axes[1, 4].violinplot(data[-1:], pos[-1:], points=200, vert=False, widths=1.1,
showmeans=True, showextrema=True, showmedians=True,
quantiles=[0.05, 0.1, 0.8, 0.9], bw_method=0.5)
axes[1, 4].set_title('Custom violinplot 10', fontsize=fs)

for ax in axes.flat:
ax.set_yticklabels([])
Expand Down
49 changes: 45 additions & 4 deletions lib/matplotlib/axes/_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7880,14 +7880,14 @@ def matshow(self, Z, **kwargs):
@_preprocess_data(replace_names=["dataset"])
def violinplot(self, dataset, positions=None, vert=True, widths=0.5,
showmeans=False, showextrema=True, showmedians=False,
points=100, bw_method=None):
quantiles=None, points=100, bw_method=None):
"""
Make a violin plot.

Make a violin plot for each column of *dataset* or each vector in
sequence *dataset*. Each filled area extends to represent the
entire data range, with optional lines at the mean, the median,
the minimum, and the maximum.
the minimum, the maximum, and user-specified quantiles.

Parameters
----------
Expand Down Expand Up @@ -7916,6 +7916,11 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5,
showmedians : bool, default = False
If `True`, will toggle rendering of the medians.

quantiles : array-like, default = None
If not None, a list of floats in the interval [0, 1] for each violin,
giving the quantiles that will be rendered for that
violin.

points : scalar, default = 100
Defines the number of points to evaluate each of the
gaussian kernel density estimations at.
Expand Down Expand Up @@ -7953,6 +7958,10 @@ def violinplot(self, dataset, positions=None, vert=True, widths=0.5,
- ``cmedians``: A `~.collections.LineCollection` instance that
marks the median values of each of the violin's distribution.

- ``cquantiles``: A `~.collections.LineCollection` instance created
to identify the quantile values of each of the violin's
distribution.

"""

def _kde_method(X, coords):
Expand All @@ -7962,7 +7971,8 @@ def _kde_method(X, coords):
kde = mlab.GaussianKDE(X, bw_method)
return kde.evaluate(coords)

vpstats = cbook.violin_stats(dataset, _kde_method, points=points)
vpstats = cbook.violin_stats(dataset, _kde_method, points=points,
quantiles=quantiles)
return self.violin(vpstats, positions=positions, vert=vert,
widths=widths, showmeans=showmeans,
showextrema=showextrema, showmedians=showmedians)
Expand All @@ -7973,7 +7983,7 @@ def violin(self, vpstats, positions=None, vert=True, widths=0.5,

Draw a violin plot for each column of `vpstats`. Each filled area
extends to represent the entire data range, with optional lines at the
mean, the median, the minimum, and the maximum.
mean, the median, the minimum, the maximum, and the quantile values.

Parameters
----------
Expand All @@ -7997,6 +8007,11 @@ def violin(self, vpstats, positions=None, vert=True, widths=0.5,

- ``max``: The maximum value for this violin's dataset.

Optional keys are:

- ``quantiles``: A list of scalars containing the quantile values
for this violin's dataset.

positions : array-like, default = [1, 2, ..., n]
Sets the positions of the violins. The ticks and limits are
automatically set to match the positions.
Expand Down Expand Up @@ -8043,13 +8058,19 @@ def violin(self, vpstats, positions=None, vert=True, widths=0.5,

- ``cmedians``: A `~.collections.LineCollection` instance that
marks the median values of each of the violin's distribution.

- ``cquantiles``: A `~.collections.LineCollection` instance created
to identify the quantile values of each of the violin's
distribution.

"""

# Statistical quantities to be plotted on the violins
means = []
mins = []
maxes = []
medians = []
quantiles = np.asarray([])

# Collections to be returned
artists = {}
Expand Down Expand Up @@ -8106,6 +8127,10 @@ def violin(self, vpstats, positions=None, vert=True, widths=0.5,
mins.append(stats['min'])
maxes.append(stats['max'])
medians.append(stats['median'])
q = stats.get('quantiles')
if q is not None:
# If the 'quantiles' key exists, assume it is a list of floats
quantiles = np.concatenate((quantiles, q))
artists['bodies'] = bodies

# Render means
Expand All @@ -8129,6 +8154,22 @@ def violin(self, vpstats, positions=None, vert=True, widths=0.5,
pmaxes,
colors=edgecolor)

# Render quantile values
if quantiles.size > 0:
# Recalculate ranges for statistics lines for quantiles.
# ppmins are the left ends of the quantile lines
ppmins = np.asarray([])
# ppmaxs are the right ends of the quantile lines
ppmaxs = np.asarray([])
for stats, cmin, cmax in zip(vpstats, pmins, pmaxes):
q = stats.get('quantiles')
if q is not None:
ppmins = np.concatenate((ppmins, [cmin] * np.size(q)))
ppmaxs = np.concatenate((ppmaxs, [cmax] * np.size(q)))
# Start rendering
artists['cquantiles'] = perp_lines(quantiles, ppmins, ppmaxs,
colors=edgecolor)

return artists

# Methods that are entirely implemented in other modules.
Expand Down
26 changes: 24 additions & 2 deletions lib/matplotlib/cbook/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -1431,7 +1431,7 @@ def _reshape_2D(X, name):
raise ValueError("{} must have 2 or fewer dimensions".format(name))


def violin_stats(X, method, points=100):
def violin_stats(X, method, points=100, quantiles=None):
"""
Returns a list of dictionaries of data which can be used to draw a series
of violin plots. See the `Returns` section below to view the required keys
Expand All @@ -1455,6 +1455,12 @@ def violin_stats(X, method, points=100):
Defines the number of points to evaluate each of the gaussian kernel
density estimates at.

quantiles : array-like, default = None
Defines (if not None) a list of floats in the interval [0, 1] for each
column of data, representing the quantiles that will be rendered for
that column of data. Must have 2 or fewer dimensions. A 1D array is
treated as a single-element list containing that array.

Returns
-------

Expand All @@ -1469,6 +1475,7 @@ def violin_stats(X, method, points=100):
- median: The median value for this column of data.
- min: The minimum value for this column of data.
- max: The maximum value for this column of data.
- quantiles: The quantile values for this column of data.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a breaking API change. I'm afraid, there's no way around that, but it at least needs an API change note.

Copy link
Contributor Author

@sasoripathos sasoripathos May 10, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am confused. If users call this function directly as an API, my code only extends the results they can get. If this extension is considered a "break", then I don't think it is necessary to make the quantiles key optional. According to the current code, whether to add a LineCollection is decided by violin().

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If users never call this function directly, then it is only called by violinplot(). Should this be counted as an API, though?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Matplotlib is extensively used out in the wild. By experience, for every remote but possible way of using the official API, there's someone out there, who actually has code with it. Therefore, we've become quite sensitive to possible API breaks. They will pop up as bug reports sooner or later. So we have to at least document the changes.

Tuple unpacking of the result will break with the change:

coords, vals, mean, median, min, max = violin_stats()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree to add a document. But I feel it is odd to make quantiles optional, since the current code in violin_stats doesn't care whether those values will be drawn.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm sorry, I misread the documentation here. It's a list of dictionaries with one dictionary per input dataset. Adding keys to these dicts is not breaking the API. So the above API change note is not necessary and should be removed again. Sorry for the noise.

"""

# List of dictionaries describing each of the violins.
Expand All @@ -1477,13 +1484,27 @@ def violin_stats(X, method, points=100):
# Want X to be a list of data sequences
X = _reshape_2D(X, "X")

for x in X:
# Want quantiles to have the same shape as the data sequences
if quantiles is not None and len(quantiles) != 0:
quantiles = _reshape_2D(quantiles, "quantiles")
# Otherwise, mock quantiles if it is None or empty
else:
quantiles = [[]] * np.shape(X)[0]

# quantiles should have the same size as dataset
if np.shape(X)[:1] != np.shape(quantiles)[:1]:
raise ValueError("List of violinplot statistics and quantiles values"
" must have the same length")

# Zip x and quantiles
for (x, q) in zip(X, quantiles):
# Dictionary of results for this distribution
stats = {}

# Calculate basic stats for the distribution
min_val = np.min(x)
max_val = np.max(x)
quantile_val = np.percentile(x, 100 * q)

# Evaluate the kernel density estimate
coords = np.linspace(min_val, max_val, points)
Expand All @@ -1495,6 +1516,7 @@ def violin_stats(X, method, points=100):
stats['median'] = np.median(x)
stats['min'] = min_val
stats['max'] = max_val
stats['quantiles'] = np.atleast_1d(quantile_val)

# Append to output
vpstats.append(stats)
Expand Down
7 changes: 4 additions & 3 deletions lib/matplotlib/pyplot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2998,12 +2998,13 @@ def triplot(*args, **kwargs):
def violinplot(
dataset, positions=None, vert=True, widths=0.5,
showmeans=False, showextrema=True, showmedians=False,
points=100, bw_method=None, *, data=None):
quantiles=None, points=100, bw_method=None, *, data=None):
return gca().violinplot(
dataset, positions=positions, vert=vert, widths=widths,
showmeans=showmeans, showextrema=showextrema,
showmedians=showmedians, points=points, bw_method=bw_method,
**({"data": data} if data is not None else {}))
showmedians=showmedians, quantiles=quantiles, points=points,
bw_method=bw_method, **({"data": data} if data is not None
else {}))


# Autogenerated by boilerplate.py. Do not edit as changes will be lost.
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
48 changes: 46 additions & 2 deletions lib/matplotlib/tests/test_axes.py
Original file line number Diff line number Diff line change
Expand Up @@ -2661,7 +2661,8 @@ def test_vert_violinplot_showall():
np.random.seed(316624790)
data = [np.random.normal(size=100) for i in range(4)]
ax.violinplot(data, positions=range(4), showmeans=1, showextrema=1,
showmedians=1)
showmedians=1,
quantiles=[[0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.4, 0.6]])


@image_comparison(baseline_images=['violinplot_vert_custompoints_10'],
Expand Down Expand Up @@ -2738,7 +2739,8 @@ def test_horiz_violinplot_showall():
np.random.seed(82762530)
data = [np.random.normal(size=100) for i in range(4)]
ax.violinplot(data, positions=range(4), vert=False, showmeans=1,
showextrema=1, showmedians=1)
showextrema=1, showmedians=1,
quantiles=[[0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.4, 0.6]])


@image_comparison(baseline_images=['violinplot_horiz_custompoints_10'],
Expand Down Expand Up @@ -2781,6 +2783,48 @@ def test_violinplot_bad_widths():
ax.violinplot(data, positions=range(4), widths=[1, 2, 3])


def test_violinplot_bad_quantiles():
# A quantiles list whose length differs from the number of datasets
# must be rejected with a ValueError.
ax = plt.axes()
# First 9 digits of frac(sqrt(73))
np.random.seed(544003745)
data = [np.random.normal(size=100)]

# One dataset but two quantile lists: the sizes do not match
with pytest.raises(ValueError):
ax.violinplot(data, quantiles=[[0.1, 0.2], [0.5, 0.7]])


def test_violinplot_outofrange_quantiles():
# Quantiles outside the interval [0, 1] must raise ValueError.
ax = plt.axes()
# First 9 digits of frac(sqrt(79))
np.random.seed(888194417)
data = [np.random.normal(size=100)]

# Quantile value above 1
with pytest.raises(ValueError):
ax.violinplot(data, quantiles=[[0.1, 0.2, 0.3, 1.05]])

# Quantile value below 0
with pytest.raises(ValueError):
ax.violinplot(data, quantiles=[[-0.05, 0.2, 0.3, 0.75]])


@check_figures_equal(extensions=["png"])
def test_violinplot_single_list_quantiles(fig_test, fig_ref):
# Ensures that, for a single dataset, a flat list of quantiles renders
# the same figure as the equivalent nested (one-list-per-dataset) form
# First 9 digits of frac(sqrt(83))
np.random.seed(110433579)
data = [np.random.normal(size=100)]

# Test image: quantiles passed as a flat list
ax = fig_test.subplots()
ax.violinplot(data, quantiles=[0.1, 0.3, 0.9])

# Reference image: quantiles passed as a list containing one list
ax = fig_ref.subplots()
ax.violinplot(data, quantiles=[[0.1, 0.3, 0.9]])


def test_manage_xticks():
_, ax = plt.subplots()
ax.set_xlim(0, 4)
Expand Down