DOC: MEP12 the histogram examples

phobson · phobson · commit 5aadd99c6a50 · 2016-10-22T11:32:29.000-07:00
diff --git a/examples/statistics/histogram_demo_cumulative.py b/examples/statistics/histogram_demo_cumulative.py
@@ -1,29 +1,61 @@
-"""
-Demo of the histogram (hist) function used to plot a cumulative distribution.
+"""Demo of using histograms to plot a cumulative distribution.
+
+This shows how to plot a cumulative, normalized histogram as a
+step function as a means of visualizing the empirical cumulative
+distribution function (CDF) of a sample. We also use the `mlab`
+module to show the theoretical CDF.
+
+A couple of other options to the `hist` function are demonstrated.
+Namely, we use the `normed` parameter to normalize the histogram and
+a couple of different options to the `cumulative` parameter. Normalizing
+a histogram means that the counts within each bin are scaled such that
+the total area of the histogram is 1. Since we're showing the
+cumulative histogram, the max value at the end of the series is 1.
+The `normed` parameter takes a boolean value.
+
+The `cumulative` kwarg is a little more nuanced. Like `normed`, you can
+pass it True or False, but you can also pass it -1 and that will
+reverse the distribution. In engineering, CDFs where `cumulative` is
+simply True are sometimes called "non-exceedance" curves. In other
+words, you can look at the y-value to see the probability of
+non-exceedance. For example, the value of 225 on the x-axis corresponds
+to about 0.85 on the y-axis, so there's an 85% chance that an
+observation in the sample does not exceed 225.
+
+Conversely, setting `cumulative` to -1, as is done in the last series
+for this example, creates an "exceedance" curve.
 
 """
 import numpy as np
 import matplotlib.pyplot as plt
 from matplotlib import mlab
+np.random.seed(0)
 
 
 mu = 200
 sigma = 25
 n_bins = 50
-x = mu + sigma*np.random.randn(10000)
+x = mu + sigma*np.random.randn(100)
 
-n, bins, patches = plt.hist(x, n_bins, normed=1,
-                            histtype='step', cumulative=True)
+fig, ax = plt.subplots()
+
+n, bins, patches = ax.hist(x, n_bins, normed=1, histtype='step',
+                           cumulative=True, label='Empirical distribution')
 
 # Add a line showing the expected distribution.
 y = mlab.normpdf(bins, mu, sigma).cumsum()
 y /= y[-1]
-plt.plot(bins, y, 'k--', linewidth=1.5)
+ax.plot(bins, y, 'k--', linewidth=1.5, label='Theoretical distribution')
 
 # Overlay a reversed cumulative histogram.
-plt.hist(x, bins=bins, normed=1, histtype='step', cumulative=-1)
+ax.hist(x, bins=bins, normed=1, histtype='step', cumulative=-1,
+        label='Reversed empirical distribution')
+
+ax.grid(True)
+ax.legend(loc='right')
+ax.set_title('Cumulative step histograms')
 
-plt.grid(True)
-plt.title('cumulative step')
+ax.set_xlabel('Annual rainfall (mm)')
+ax.set_ylabel('Likelihood of occurrence')
 
 plt.show()
diff --git a/examples/statistics/histogram_demo_features.py b/examples/statistics/histogram_demo_features.py
@@ -1,35 +1,42 @@
 """
 Demo of the histogram (hist) function with a few features.
 
-In addition to the basic histogram, this demo shows a few optional features:
+In addition to the basic histogram, this demo shows a few optional
+features:
 
     * Setting the number of data bins
-    * The ``normed`` flag, which normalizes bin heights so that the integral of
-      the histogram is 1. The resulting histogram is a probability density.
+    * The ``normed`` flag, which normalizes bin heights so that the
+      integral of the histogram is 1. The resulting histogram is an
+      approximation of the probability density function.
     * Setting the face color of the bars
     * Setting the opacity (alpha value).
-
 """
+
 import numpy as np
 import matplotlib.mlab as mlab
 import matplotlib.pyplot as plt
+np.random.seed(0)
 
 
 # example data
 mu = 100  # mean of distribution
 sigma = 15  # standard deviation of distribution
-x = mu + sigma * np.random.randn(10000)
+x = mu + sigma * np.random.randn(10)
 
 num_bins = 50
+
+fig, ax = plt.subplots()
+
 # the histogram of the data
-n, bins, patches = plt.hist(x, num_bins, normed=1)
+n, bins, patches = ax.hist(x, num_bins, normed=1)
+
 # add a 'best fit' line
 y = mlab.normpdf(bins, mu, sigma)
-plt.plot(bins, y, '--')
-plt.xlabel('Smarts')
-plt.ylabel('Probability density')
-plt.title(r'Histogram of IQ: $\mu=100$, $\sigma=15$')
+ax.plot(bins, y, '--')
+ax.set_xlabel('Smarts')
+ax.set_ylabel('Probability density')
+ax.set_title(r'Histogram of IQ: $\mu=100$, $\sigma=15$')
 
 # Tweak spacing to prevent clipping of ylabel
-plt.subplots_adjust(left=0.15)
+fig.tight_layout()
 plt.show()
diff --git a/examples/statistics/histogram_demo_histtypes.py b/examples/statistics/histogram_demo_histtypes.py
@@ -1,17 +1,19 @@
 """
-Demo of the histogram (hist) function with different ``histtype`` settings.
+Demo of the histogram (hist) function with different ``histtype``
+settings.
 
 * Histogram with step curve that has a color fill.
-* Histogram with with unequal bin widths.
+* Histogram with custom and unequal bin widths.
 
 """
 import numpy as np
 import matplotlib.pyplot as plt
+np.random.seed(0)
 
 
 mu = 200
 sigma = 25
-x = mu + sigma*np.random.randn(10000)
+x = mu + sigma*np.random.randn(100)
 
 fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(8, 4))
 
@@ -23,5 +25,5 @@
 ax1.hist(x, bins, normed=1, histtype='bar', rwidth=0.8)
 ax1.set_title('unequal bins')
 
-plt.tight_layout()
+fig.tight_layout()
 plt.show()
diff --git a/examples/statistics/histogram_demo_multihist.py b/examples/statistics/histogram_demo_multihist.py
@@ -8,15 +8,16 @@
     * Step curve with a color fill
     * Data sets of different sample sizes
 """
+
 import numpy as np
 import matplotlib.pyplot as plt
-
+np.random.seed(0)
 
 n_bins = 10
 x = np.random.randn(1000, 3)
 
 fig, axes = plt.subplots(nrows=2, ncols=2)
-ax0, ax1, ax2, ax3 = axes.flat
+ax0, ax1, ax2, ax3 = axes.flatten()
 
 colors = ['red', 'tan', 'lime']
 ax0.hist(x, n_bins, normed=1, histtype='bar', color=colors, label=colors)
@@ -34,5 +35,5 @@
 ax3.hist(x_multi, n_bins, histtype='bar')
 ax3.set_title('different sample sizes')
 
-plt.tight_layout()
+fig.tight_layout()
 plt.show()
diff --git a/examples/statistics/multiple_histograms_side_by_side.py b/examples/statistics/multiple_histograms_side_by_side.py
@@ -1,36 +1,50 @@
 
-"""
-Demo of how to produce multiple histograms side by side
+"""Demo of how to produce multiple histograms side by side
+
+This example plots horizontal histograms of different samples along
+a categorical x-axis. Additionally, the histograms are plotted to
+be symmetrical about their x-position, thus making them very similar
+to violin plots.
+
+To make this highly specialized plot, we can't use the standard `hist`
+method. Instead we use `barh` to draw the horizontal bars directly. The
+vertical positions and lengths of the bars are computed via the
+`np.histogram` function. The histograms for all the samples are
+computed using the same range (min and max values) and number of bins,
+so that the bins for each sample are in the same vertical positions.
 """
 
 import numpy as np
 import matplotlib.pyplot as plt
+np.random.seed(0)
 
 number_of_bins = 20
 
 # An example of three data sets to compare
-number_of_data_points = 1000
+number_of_data_points = 87
 labels = ["A", "B", "C"]
 data_sets = [np.random.normal(0, 1, number_of_data_points),
              np.random.normal(6, 1, number_of_data_points),
              np.random.normal(-3, 1, number_of_data_points)]
 
 # Computed quantities to aid plotting
 hist_range = (np.min(data_sets), np.max(data_sets))
-binned_data_sets = [np.histogram(d, range=hist_range, bins=number_of_bins)[0]
-                    for d in data_sets]
+binned_data_sets = [
+    np.histogram(d, range=hist_range, bins=number_of_bins)[0]
+    for d in data_sets
+]
 binned_maximums = np.max(binned_data_sets, axis=1)
 x_locations = np.arange(0, sum(binned_maximums), np.max(binned_maximums))
 
 # The bin_edges are the same for all of the histograms
 bin_edges = np.linspace(hist_range[0], hist_range[1], number_of_bins + 1)
-centers = .5 * (bin_edges + np.roll(bin_edges, 1))[:-1]
+centers = 0.5 * (bin_edges[:-1] + bin_edges[1:])
 heights = np.diff(bin_edges)
 
 # Cycle through and plot each histogram
-ax = plt.subplot(111)
+fig, ax = plt.subplots()
 for x_loc, binned_data in zip(x_locations, binned_data_sets):
-    lefts = x_loc - .5 * binned_data
+    lefts = x_loc - 0.5 * binned_data
     ax.barh(centers, binned_data, height=heights, left=lefts)
 
 ax.set_xticks(x_locations)