Skip to content

DOC: normalizing histograms #27426

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions galleries/examples/statistics/hist.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
axs[0].hist(dist1, bins=n_bins)
axs[1].hist(dist2, bins=n_bins)

plt.show()


# %%
# Updating histogram colors
Expand Down Expand Up @@ -99,8 +101,6 @@
# We can also define custom numbers of bins for each axis
axs[2].hist2d(dist1, dist2, bins=(80, 10), norm=colors.LogNorm())

plt.show()

# %%
#
# .. admonition:: References
Expand Down
60 changes: 0 additions & 60 deletions galleries/examples/statistics/histogram_features.py

This file was deleted.

255 changes: 255 additions & 0 deletions galleries/examples/statistics/histogram_normalization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
"""
.. redirect-from:: /gallery/statistics/histogram_features

===================================
Histogram bins, density, and weight
===================================

The `.Axes.hist` method can flexibly create histograms in a few different ways,
which is flexible and helpful, but can also lead to confusion. In particular,
you can:

- bin the data as you want, either with an automatically chosen number of
bins, or with fixed bin edges,
- normalize the histogram so that its integral is one,
- and assign weights to the data points, so that each data point affects the
count in its bin differently.

The Matplotlib ``hist`` method calls `numpy.histogram` and plots the results,
therefore users should consult the numpy documentation for a definitive guide.

Histograms are created by defining bin edges, and taking a dataset of values
and sorting them into the bins, and counting or summing how much data is in
each bin. In this simple example, 9 numbers between 1 and 4 are sorted into 3
bins:
"""

import matplotlib.pyplot as plt
import numpy as np

rng = np.random.default_rng(19680801)

xdata = np.array([1.2, 2.3, 3.3, 3.1, 1.7, 3.4, 2.1, 1.25, 1.3])
xbins = np.array([1, 2, 3, 4])

# changing the style of the histogram bars just to make it
# very clear where the boundaries of the bins are:
style = {'facecolor': 'none', 'edgecolor': 'C0', 'linewidth': 3}

fig, ax = plt.subplots()
ax.hist(xdata, bins=xbins, **style)

# plot the xdata locations on the x axis:
ax.plot(xdata, 0*xdata, 'd')
ax.set_ylabel('Number per bin')
ax.set_xlabel('x bins (dx=1.0)')

# %%
# Modifying bins
# ==============
#
# Changing the bin size changes the shape of this sparse histogram, so its a
# good idea to choose bins with some care with respect to your data. Here we
# make the bins half as wide.

xbins = np.arange(1, 4.5, 0.5)

fig, ax = plt.subplots()
ax.hist(xdata, bins=xbins, **style)
ax.plot(xdata, 0*xdata, 'd')
ax.set_ylabel('Number per bin')
ax.set_xlabel('x bins (dx=0.5)')

# %%
# We can also let numpy (via Matplotlib) choose the bins automatically, or
# specify a number of bins to choose automatically:

fig, ax = plt.subplot_mosaic([['auto', 'n4']],
sharex=True, sharey=True, layout='constrained')

ax['auto'].hist(xdata, **style)
ax['auto'].plot(xdata, 0*xdata, 'd')
ax['auto'].set_ylabel('Number per bin')
ax['auto'].set_xlabel('x bins (auto)')

ax['n4'].hist(xdata, bins=4, **style)
ax['n4'].plot(xdata, 0*xdata, 'd')
ax['n4'].set_xlabel('x bins ("bins=4")')

# %%
# Normalizing histograms: density and weight
# ==========================================
#
# Counts-per-bin is the default length of each bar in the histogram. However,
# we can also normalize the bar lengths as a probability density function using
# the ``density`` parameter:

fig, ax = plt.subplots()
ax.hist(xdata, bins=xbins, density=True, **style)
ax.set_ylabel('Probability density [$V^{-1}$])')
ax.set_xlabel('x bins (dx=0.5 $V$)')

# %%
# This normalization can be a little hard to interpret when just exploring the
# data. The value attached to each bar is divided by the total number of data
# points *and* the width of the bin, and thus the values _integrate_ to one
# when integrating across the full range of data.
# e.g. ::
#
# density = counts / (sum(counts) * np.diff(bins))
# np.sum(density * np.diff(bins)) == 1
#
# This normalization is how `probability density functions
# <https://en.wikipedia.org/wiki/Probability_density_function>`_ are defined in
# statistics. If :math:`X` is a random variable on :math:`x`, then :math:`f_X`
# is is the probability density function if :math:`P[a<X<b] = \int_a^b f_X dx`.
# If the units of x are Volts, then the units of :math:`f_X` are :math:`V^{-1}`
# or probability per change in voltage.
#
# The usefulness of this normalization is a little more clear when we draw from
# a known distribution and try to compare with theory. So, choose 1000 points
# from a `normal distribution
# <https://en.wikipedia.org/wiki/Normal_distribution>`_, and also calculate the
# known probability density function:

xdata = rng.normal(size=1000)
xpdf = np.arange(-4, 4, 0.1)
pdf = 1 / (np.sqrt(2 * np.pi)) * np.exp(-xpdf**2 / 2)

# %%
# If we don't use ``density=True``, we need to scale the expected probability
# distribution function by both the length of the data and the width of the
# bins:

fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
dx = 0.1
xbins = np.arange(-4, 4, dx)
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For all the bin width comparisons, it's kind of hard to tell the bin widths from the 'step' type - is there a way to actually show the bins?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With so many bins, vertical lines end up almost merging. Fewer bins, it's hard to see the normal distribution.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about stacking as rows instead of columns so you have more horizontal space to work in?


# scale and plot the expected pdf:
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x$')
ax['False'].set_ylabel('Count per bin')
ax['False'].set_xlabel('x bins [V]')
ax['False'].legend()

ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
ax['True'].set_xlabel('x bins [$V$]')
ax['True'].legend()

# %%
# One advantage of using the density is therefore that the shape and amplitude
# of the histogram does not depend on the size of the bins. Consider an
# extreme case where the bins do not have the same width. In this example, the
# bins below ``x=-1.25`` are six times wider than the rest of the bins. By
# normalizing by density, we preserve the shape of the distribution, whereas if
# we do not, then the wider bins have much higher counts than the thinner bins:

fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')
dx = 0.1
xbins = np.hstack([np.arange(-4, -1.25, 6*dx), np.arange(-1.25, 4, dx)])
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step', label='Counts')
ax['False'].plot(xpdf, pdf * len(xdata) * dx, label=r'$N\,f_X(x)\,\delta x_0$')
ax['False'].set_ylabel('Count per bin')
ax['False'].set_xlabel('x bins [V]')
ax['False'].legend()

ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label='density')
ax['True'].plot(xpdf, pdf, label='$f_X(x)$')
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
ax['True'].set_xlabel('x bins [$V$]')
ax['True'].legend()

# %%
# Similarly, if we want to compare histograms with different bin widths, we may
# want to use ``density=True``:

fig, ax = plt.subplot_mosaic([['False', 'True']], layout='constrained')

# expected PDF
ax['True'].plot(xpdf, pdf, '--', label='$f_X(x)$', color='k')

for nn, dx in enumerate([0.1, 0.4, 1.2]):
xbins = np.arange(-4, 4, dx)
# expected histogram:
ax['False'].plot(xpdf, pdf*1000*dx, '--', color=f'C{nn}')
ax['False'].hist(xdata, bins=xbins, density=False, histtype='step')

ax['True'].hist(xdata, bins=xbins, density=True, histtype='step', label=dx)

Comment on lines +174 to +180
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you need both here and all three? only because the busyness makes it feel very cluttered in a way where it's hard to read off the lesson

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the main point of why you want to normalize, so comparing and contrasting with and without normalizing is the goal. Multiple bin sizes is to better give the reader an idea of how the bin size affects their results. Sure, it's busy, but I don't think incomprehensibly so.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe same as above, stacking horizontally ? or maybe thinner lines + making the histogram colors paler so that it's easier to visually distinguish?

# Labels:
ax['False'].set_xlabel('x bins [$V$]')
ax['False'].set_ylabel('Count per bin')
ax['True'].set_ylabel('Probability density [$V^{-1}$]')
ax['True'].set_xlabel('x bins [$V$]')
ax['True'].legend(fontsize='small', title='bin width:')

# %%
# Sometimes people want to normalize so that the sum of counts is one. This is
# analogous to a `probability mass function
# <https://en.wikipedia.org/wiki/Probability_mass_function>`_ for a discrete
# variable where the sum of probabilities for all the values equals one. Using
# ``hist``, we can get this normalization if we set the *weights* to 1/N.
# Note that the amplitude of this normalized histogram still depends on
# width and/or number of the bins:

fig, ax = plt.subplots(layout='constrained', figsize=(3.5, 3))

for nn, dx in enumerate([0.1, 0.4, 1.2]):
xbins = np.arange(-4, 4, dx)
ax.hist(xdata, bins=xbins, weights=1/len(xdata) * np.ones(len(xdata)),
histtype='step', label=f'{dx}')
ax.set_xlabel('x bins [$V$]')
ax.set_ylabel('Bin count / N')
ax.legend(fontsize='small', title='bin width:')

# %%
# The value of normalizing histograms is comparing two distributions that have
# different sized populations. Here we compare the distribution of ``xdata``
# with a population of 1000, and ``xdata2`` with 100 members.

xdata2 = rng.normal(size=100)

fig, ax = plt.subplot_mosaic([['no_norm', 'density', 'weight']],
layout='constrained', figsize=(8, 4))

xbins = np.arange(-4, 4, 0.25)

ax['no_norm'].hist(xdata, bins=xbins, histtype='step')
ax['no_norm'].hist(xdata2, bins=xbins, histtype='step')
ax['no_norm'].set_ylabel('Counts')
ax['no_norm'].set_xlabel('x bins [$V$]')
ax['no_norm'].set_title('No normalization')

ax['density'].hist(xdata, bins=xbins, histtype='step', density=True)
ax['density'].hist(xdata2, bins=xbins, histtype='step', density=True)
ax['density'].set_ylabel('Probability density [$V^{-1}$]')
ax['density'].set_title('Density=True')
ax['density'].set_xlabel('x bins [$V$]')

ax['weight'].hist(xdata, bins=xbins, histtype='step',
weights=1 / len(xdata) * np.ones(len(xdata)),
label='N=1000')
ax['weight'].hist(xdata2, bins=xbins, histtype='step',
weights=1 / len(xdata2) * np.ones(len(xdata2)),
label='N=100')
ax['weight'].set_xlabel('x bins [$V$]')
ax['weight'].set_ylabel('Counts / N')
ax['weight'].legend(fontsize='small')
ax['weight'].set_title('Weight = 1/N')

plt.show()

# %%
#
# .. admonition:: References
#
# The use of the following functions, methods, classes and modules is shown
# in this example:
#
# - `matplotlib.axes.Axes.hist` / `matplotlib.pyplot.hist`
# - `matplotlib.axes.Axes.set_title`
# - `matplotlib.axes.Axes.set_xlabel`
# - `matplotlib.axes.Axes.set_ylabel`
# - `matplotlib.axes.Axes.legend`