DOC: improve datasets information

GaelVaroquaux · GaelVaroquaux · commit 348d9aa6cab8 · 2011-05-09T15:26:20.000+02:00
Add some links across the documentation and examples

Add each dataset's description to the docstring of its loader
diff --git a/doc/modules/datasets.rst b/doc/modules/datasets.rst
@@ -1,3 +1,5 @@
+.. _datasets:
+
 =========================
 Dataset loading utilities
 =========================
@@ -16,6 +18,21 @@ This package also features helpers to fetch larger datasets commonly
 used by the machine learning community to benchmark algorithm on data
 that comes from the 'real world'.
 
+Datasets shipped with scikit-learn
+========================================
+
+scikit-learn comes with a few standard datasets:
+
+.. autosummary::
+
+   :toctree: generated/
+   :template: function.rst
+
+   load_iris
+   load_diabetes
+   load_digits
+   load_linnerud
+
 
 Dataset generators
 ==================
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
@@ -1,3 +1,5 @@
+.. _getting_started:
+
 Getting started: an introduction to machine learning with scikits.learn
 =======================================================================
 
@@ -67,7 +69,9 @@ the `digits dataset
 A dataset is a dictionary-like object that holds all the data and some
 metadata about the data. This data is stored in the `.data` member, which
 is a `n_samples, n_features` array. In the case of supervised problem,
-explanatory variables are stored in the `.target` member.
+one or more response variables are stored in the `.target` member. More
+details on the different datasets can be found in the
+:ref:`dedicated section <datasets>`.
 
 For instance, in the case of the digits dataset, `digits.data` gives
 access to the features that can be used to classify the digits samples::
diff --git a/examples/plot_digits_classification.py b/examples/plot_digits_classification.py
@@ -6,6 +6,9 @@
 An example showing how the scikit-learn can be used to recognize images of
 hand-written digits.
 
+This example is commented in the 
+:ref:`tutorial section of the user manual <getting_started>`.
+
 """
 print __doc__
 
@@ -15,13 +18,17 @@
 # Standard scientific Python imports
 import pylab as pl
 
+# Import datasets, classifiers and performance metrics
+from scikits.learn import datasets, svm, metrics
+
 # The digits dataset
-from scikits.learn import datasets
 digits = datasets.load_digits()
 
 # The data that we are interested in is made of 8x8 images of digits,
-# let's have a look at the first 3 images. We know which digit they
-# represent: it is given in the 'target' of the dataset.
+# let's have a look at the first 3 images, stored in the `images`
+# attribute of the dataset. If we were working from image files, we
+# could load them using pylab.imread. For these images, we know which
+# digit they represent: it is given in the 'target' of the dataset.
 for index, (image, label) in enumerate(zip(digits.images, digits.target)[:4]):
     pl.subplot(2, 4, index+1)
     pl.imshow(image, cmap=pl.cm.gray_r)
@@ -32,10 +39,7 @@
 n_samples = len(digits.images)
 data = digits.images.reshape((n_samples, -1))
 
-# Import a classifier:
-from scikits.learn import svm
-from scikits.learn.metrics import classification_report
-from scikits.learn.metrics import confusion_matrix
+# Create a classifier: a support vector classifier
 classifier = svm.SVC()
 
 # We learn the digits on the first half of the digits
@@ -45,13 +49,9 @@
 expected = digits.target[n_samples/2:]
 predicted = classifier.predict(data[n_samples/2:])
 
-print "Classification report for classifier:"
-print classifier
-print
-print classification_report(expected, predicted)
-print
-print "Confusion matrix:"
-print confusion_matrix(expected, predicted)
+print "Classification report for classifier %s:\n%s\n" % (
+    classifier, metrics.classification_report(expected, predicted))
+print "Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted)
 
 for index, (image, prediction) in enumerate(
     zip(digits.images[n_samples/2:], predicted)[:4]):
diff --git a/scikits/learn/datasets/base.py b/scikits/learn/datasets/base.py
@@ -9,6 +9,7 @@
 
 import csv
 import shutil
+import textwrap
 from os import environ
 from os.path import dirname
 from os.path import join
@@ -20,6 +21,7 @@
 
 import numpy as np
 
+################################################################################
 
 class Bunch(dict):
     """ Container object for datasets: dictionnary-like object that
@@ -208,7 +210,6 @@ def load_iris():
 def load_digits(n_class=10):
     """load the digits dataset and returns it.
 
-
     Parameters
     ----------
     n_class : integer, between 0 and 10
@@ -256,13 +257,35 @@ def load_digits(n_class=10):
 
 
 def load_diabetes():
+    """ Load the diabetes dataset and returns it.
+
+    Returns
+    -------
+    data : Bunch
+        Dictionary-like object, the interesting attributes are:
+        'data', the data to learn and 'target', the labels for each
+        sample.
+
+
+    """
     base_dir = join(dirname(__file__), 'data')
     data = np.loadtxt(join(base_dir, 'diabetes_data.csv.gz'))
     target = np.loadtxt(join(base_dir, 'diabetes_target.csv.gz'))
     return Bunch(data=data, target=target)
 
 
 def load_linnerud():
+    """ Load the linnerud dataset and returns it.
+
+    Returns
+    -------
+    data : Bunch
+        Dictionary-like object, the interesting attributes are:
+        'data_exercise' and 'data_physiological', the two multivariate
+        datasets, as well as 'header_exercise' and
+        'header_physiological', the corresponding headers.
+
+    """
     base_dir = join(dirname(__file__), 'data/')
     # Read data
     data_exercise = np.loadtxt(base_dir + 'linnerud_exercise.csv', skiprows=1)
@@ -280,3 +303,29 @@ def load_linnerud():
                  data_physiological=data_physiological,
                  header_physiological=header_physiological,
                  DESCR=fdescr.read())
+
+################################################################################
+# Add the description in the docstring
+
+def _add_notes(function, filename):
+    """Add a notes section to the docstring of a function reading it from a
+    file"""
+    fdescr = open(join(dirname(__file__), 'descr', filename), 'r')
+    # Dedent the docstring
+    doc = function.__doc__.split('\n')
+    doc = '%s\n%s' % (textwrap.dedent(doc[0]),
+                      textwrap.dedent('\n'.join(doc[1:])))
+    # Remove the first line of the description, which contains the
+    # dataset's name
+    descr = '\n'.join(fdescr.read().split('\n')[1:])
+    function.__doc__ = doc + descr
+
+
+for function, filename in ((load_iris, 'iris.rst'),
+                           (load_linnerud, 'linnerud.rst'), 
+                           (load_digits, 'digits.rst')):
+    #try:
+        _add_notes(function, filename)
+    #except:
+    #    pass
+
diff --git a/scikits/learn/datasets/descr/digits.rst b/scikits/learn/datasets/descr/digits.rst
@@ -1,12 +1,22 @@
  Optical Recognition of Handwritten Digits Data Set
 
-Data Set Characteristics:  
-        
-Source
+
+Notes
 -------
 
-Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
-Date: July; 1998
+Data Set Characteristics:  
+
+    :Number of Instances: 5620
+
+    :Number of Attributes: 64
+
+    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
+
+    :Missing Attribute Values: None 
+
+    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
+    
+    :Date: July, 1998
 
 This is a copy of the test set of the UCI ML hand-written digits datasets 
 
@@ -44,12 +54,4 @@ References
   - ...
 
 
-Number of Instances: 5620
-
-Number of Attributes: 64
-
-Attribute Information: 8x8 image of integer pixels in the range 0..16.
-
-Missing Attribute Values: None 
-
 
diff --git a/scikits/learn/datasets/descr/iris.rst b/scikits/learn/datasets/descr/iris.rst
@@ -1,10 +1,40 @@
 Iris Plants Database
 
-Source
+Notes
 ------
-Creator: R.A. Fisher 
-Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
-Date: July, 1988
+Data Set Characteristics:  
+
+    :Number of Instances: 150 (50 in each of three classes)
+
+    :Number of Attributes: 4 numeric, predictive attributes and the class
+
+    :Attribute Information:
+        - sepal length in cm
+        - sepal width in cm
+        - petal length in cm
+        - petal width in cm
+        - class: 
+                - Iris-Setosa
+                - Iris-Versicolour
+                - Iris-Virginica
+
+    :Summary Statistics: 
+    ============== ==== ==== ======= ===== ====================
+                    Min  Max   Mean    SD   Class Correlation
+    ============== ==== ==== ======= ===== ====================
+    sepal length:   4.3  7.9   5.84   0.83    0.7826   
+    sepal width:    2.0  4.4   3.05   0.43   -0.4194
+    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
+    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)
+    ============== ==== ==== ======= ===== ====================
+
+    :Missing Attribute Values: None
+
+    :Class Distribution: 33.3% for each of 3 classes.
+
+    :Creator: R.A. Fisher 
+    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
+    :Date: July, 1988
 
 This is a copy of UCI ML iris datasets.
 
@@ -37,28 +67,4 @@ References
    - Many, many more ...
 
 
-Number of Instances: 150 (50 in each of three classes)
-
-Number of Attributes: 4 numeric, predictive attributes and the class
-
-Attribute Information:
-   - sepal length in cm
-   - sepal width in cm
-   - petal length in cm
-   - petal width in cm
-   - class: 
-        - Iris-Setosa
-        - Iris-Versicolour
-        - Iris-Virginica
-
-Summary Statistics:
-                 Min  Max   Mean    SD   Class Correlation
-   sepal length: 4.3  7.9   5.84  0.83    0.7826   
-    sepal width: 2.0  4.4   3.05  0.43   -0.4194
-   petal length: 1.0  6.9   3.76  1.76    0.9490  (high!)
-    petal width: 0.1  2.5   1.20  0.76    0.9565  (high!)
-
-Missing Attribute Values: None
-
-Class Distribution: 33.3% for each of 3 classes.
 
diff --git a/scikits/learn/datasets/descr/linnerud.rst b/scikits/learn/datasets/descr/linnerud.rst
@@ -1,3 +1,11 @@
+
+Notes
+------
+
+:Number of Instances: 20
+:Number of Attributes: 3
+:Missing Attribute Values: None 
+
 The Linnerud dataset constains two small dataset:
 
 - *exercise* A list containing the following components: exercise data with
@@ -6,19 +14,11 @@ The Linnerud dataset constains two small dataset:
 - *physiological* data frame with 20 observations on 3 physiological variables:
    Chins, Situps and Jumps
 
-Source
-------
-
-Tenenhaus, M. (1998), Table 1, page 15.
+**Source:** Tenenhaus, M. (1998), Table 1, page 15.
 
 References
 ----------
 
-Tenenhaus, M. (1998). La regression PLS: theorie et pratique. Paris: Editions Technic.
-
-Number of Instances: 20
-
-Number of Attributes: 3
+* Tenenhaus, M. (1998). La regression PLS: theorie et pratique. Paris: Editions Technic.
 
-Missing Attribute Values: None