Skip to content

Commit 5aadd99

Browse files
committed
DOC: MEP12 the histogram examples
1 parent 26e762d commit 5aadd99

5 files changed

+91
-35
lines changed
Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,61 @@
1-
"""
2-
Demo of the histogram (hist) function used to plot a cumulative distribution.
1+
"""Demo of using histograms to plot a cumulative distribution.
2+
3+
This shows how to plot a cumulative, normalized histogram as a
4+
step function as a means of visualizing the empirical cumulative
5+
distribution function (CDF) of a sample. We also use the `mlab`
6+
module to show the theoretical CDF.
7+
8+
A couple of other options to the `hist` function are demonstrated.
9+
Namely, we use the `normed` parameter to normalize the histogram and
10+
a couple of different options to the `cumulative` parameter. Normalizing
11+
a histogram means that the counts within each bin are scaled such that
12+
the total area of all the bins sums up to 1. Since we're showing the
13+
cumulative histogram, the max value at the end of the series is 1.
14+
The `normed` parameter takes a boolean value.
15+
16+
The `cumulative` kwarg is a little more nuanced. Like `normed`, you can
17+
pass it True or False, but you can also pass it -1 and that will
18+
reverse the distribution. In engineering, CDFs where `cumulative` is
19+
simply True are sometimes called "non-exceedance" curves. In other words, you
20+
can look at the y-value to see the probability of non-exceedance. For example
21+
the value of 225 on the x-axis corresponds to about 0.85 on the y-axis,
22+
so there's an 85% chance that an observation in the sample does not
23+
exceed 225.
24+
25+
Conversely, setting `cumulative` to -1, as is done in the last series
26+
for this example, creates an "exceedance" curve.
327
428
"""
529
import numpy as np
630
import matplotlib.pyplot as plt
731
from matplotlib import mlab
32+
np.random.seed(0)
833

934

1035
mu = 200
1136
sigma = 25
1237
n_bins = 50
13-
x = mu + sigma*np.random.randn(10000)
38+
x = mu + sigma*np.random.randn(100)
1439

15-
n, bins, patches = plt.hist(x, n_bins, normed=1,
16-
histtype='step', cumulative=True)
40+
n, bins, patches = ax.hist(x, n_bins, normed=1, histtype='step',
41+
cumulative=True, label='Empirical distribution')
1742

1843
# Add a line showing the expected distribution.
1944
y = mlab.normpdf(bins, mu, sigma).cumsum()
2045
y /= y[-1]
21-
plt.plot(bins, y, 'k--', linewidth=1.5)
46+
47+
fig, ax = plt.subplots()
48+
ax.plot(bins, y, 'k--', linewidth=1.5, label='Theoretical distribution')
2249

2350
# Overlay a reversed cumulative histogram.
24-
plt.hist(x, bins=bins, normed=1, histtype='step', cumulative=-1)
51+
ax.hist(x, bins=bins, normed=1, histtype='step', cumulative=-1,
52+
label='Reversed empirical distribution')
53+
54+
ax.grid(True)
55+
ax.legend(loc='right')
56+
ax.set_title('Cumulative step histograms')
2557

26-
plt.grid(True)
27-
plt.title('cumulative step')
58+
ax.set_xlabel('Annual rainfall (mm)')
59+
ax.set_ylabel('Likelihood of occurrence')
2860

2961
plt.show()
Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,35 +1,42 @@
11
"""
22
Demo of the histogram (hist) function with a few features.
33
4-
In addition to the basic histogram, this demo shows a few optional features:
4+
In addition to the basic histogram, this demo shows a few optional
5+
features:
56
67
* Setting the number of data bins
7-
* The ``normed`` flag, which normalizes bin heights so that the integral of
8-
the histogram is 1. The resulting histogram is a probability density.
8+
* The ``normed`` flag, which normalizes bin heights so that the
9+
integral of the histogram is 1. The resulting histogram is an
10+
approximation of the probability density function.
911
* Setting the face color of the bars
1012
* Setting the opacity (alpha value).
11-
1213
"""
14+
1315
import numpy as np
1416
import matplotlib.mlab as mlab
1517
import matplotlib.pyplot as plt
18+
np.random.seed(0)
1619

1720

1821
# example data
1922
mu = 100 # mean of distribution
2023
sigma = 15 # standard deviation of distribution
21-
x = mu + sigma * np.random.randn(10000)
24+
x = mu + sigma * np.random.randn(10)
2225

2326
num_bins = 50
27+
28+
fig, ax = plt.subplots()
29+
2430
# the histogram of the data
25-
n, bins, patches = plt.hist(x, num_bins, normed=1)
31+
n, bins, patches = ax.hist(x, num_bins, normed=1)
32+
2633
# add a 'best fit' line
2734
y = mlab.normpdf(bins, mu, sigma)
28-
plt.plot(bins, y, '--')
29-
plt.xlabel('Smarts')
30-
plt.ylabel('Probability density')
31-
plt.title(r'Histogram of IQ: $\mu=100$, $\sigma=15$')
35+
ax.plot(bins, y, '--')
36+
ax.set_xlabel('Smarts')
37+
ax.set_ylabel('Probability density')
38+
ax.set_title(r'Histogram of IQ: $\mu=100$, $\sigma=15$')
3239

3340
# Tweak spacing to prevent clipping of ylabel
34-
plt.subplots_adjust(left=0.15)
41+
fig.tight_layout()
3542
plt.show()
Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,19 @@
11
"""
2-
Demo of the histogram (hist) function with different ``histtype`` settings.
2+
Demo of the histogram (hist) function with different ``histtype``
3+
settings.
34
45
* Histogram with step curve that has a color fill.
5-
* Histogram with with unequal bin widths.
6+
* Histogram with custom and unequal bin widths.
67
78
"""
89
import numpy as np
910
import matplotlib.pyplot as plt
11+
np.random.seed(0)
1012

1113

1214
mu = 200
1315
sigma = 25
14-
x = mu + sigma*np.random.randn(10000)
16+
x = mu + sigma*np.random.randn(100)
1517

1618
fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(8, 4))
1719

@@ -23,5 +25,5 @@
2325
ax1.hist(x, bins, normed=1, histtype='bar', rwidth=0.8)
2426
ax1.set_title('unequal bins')
2527

26-
plt.tight_layout()
28+
fig.tight_layout()
2729
plt.show()

examples/statistics/histogram_demo_multihist.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,16 @@
88
* Step curve with a color fill
99
* Data sets of different sample sizes
1010
"""
11+
1112
import numpy as np
1213
import matplotlib.pyplot as plt
13-
14+
np.random.seed(0)
1415

1516
n_bins = 10
1617
x = np.random.randn(1000, 3)
1718

1819
fig, axes = plt.subplots(nrows=2, ncols=2)
19-
ax0, ax1, ax2, ax3 = axes.flat
20+
ax0, ax1, ax2, ax3 = axes.flatten()
2021

2122
colors = ['red', 'tan', 'lime']
2223
ax0.hist(x, n_bins, normed=1, histtype='bar', color=colors, label=colors)
@@ -34,5 +35,5 @@
3435
ax3.hist(x_multi, n_bins, histtype='bar')
3536
ax3.set_title('different sample sizes')
3637

37-
plt.tight_layout()
38+
fig.tight_layout()
3839
plt.show()

examples/statistics/multiple_histograms_side_by_side.py

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,50 @@
11

2-
"""
3-
Demo of how to produce multiple histograms side by side
2+
"""Demo of how to produce multiple histograms side by side
3+
4+
This example plots horizontal histograms of different samples along
5+
a categorical x-axis. Additionally, the histograms are plotted to
6+
be symmetrical about their x-position, thus making them very similar
7+
to violin plots.
8+
9+
To make this highly specialized plot, we can't use the standard `hist`
10+
method. Instead we use `barh` to draw the horizontal bars directly. The
11+
vertical positions and lengths of the bars are computed via the
12+
`np.histogram` function. The histograms for all the samples are
13+
computed using the same range (min and max values) and number of bins,
14+
so that the bins for each sample are in the same vertical positions.
415
"""
516

617
import numpy as np
718
import matplotlib.pyplot as plt
19+
np.random.seed(0)
820

921
number_of_bins = 20
1022

1123
# An example of three data sets to compare
12-
number_of_data_points = 1000
24+
number_of_data_points = 87
1325
labels = ["A", "B", "C"]
1426
data_sets = [np.random.normal(0, 1, number_of_data_points),
1527
np.random.normal(6, 1, number_of_data_points),
1628
np.random.normal(-3, 1, number_of_data_points)]
1729

1830
# Computed quantities to aid plotting
1931
hist_range = (np.min(data_sets), np.max(data_sets))
20-
binned_data_sets = [np.histogram(d, range=hist_range, bins=number_of_bins)[0]
21-
for d in data_sets]
32+
binned_data_sets = [
33+
np.histogram(d, range=hist_range, bins=number_of_bins)[0]
34+
for d in data_sets
35+
]
2236
binned_maximums = np.max(binned_data_sets, axis=1)
2337
x_locations = np.arange(0, sum(binned_maximums), np.max(binned_maximums))
2438

2539
# The bin_edges are the same for all of the histograms
2640
bin_edges = np.linspace(hist_range[0], hist_range[1], number_of_bins + 1)
27-
centers = .5 * (bin_edges + np.roll(bin_edges, 1))[:-1]
41+
centers = 0.5 * (bin_edges + np.roll(bin_edges, 1))[:-1]
2842
heights = np.diff(bin_edges)
2943

3044
# Cycle through and plot each histogram
31-
ax = plt.subplot(111)
45+
fig, ax = plt.subplots()
3246
for x_loc, binned_data in zip(x_locations, binned_data_sets):
33-
lefts = x_loc - .5 * binned_data
47+
lefts = x_loc - 0.5 * binned_data
3448
ax.barh(centers, binned_data, height=heights, left=lefts)
3549

3650
ax.set_xticks(x_locations)

0 commit comments

Comments
 (0)