
Commit 977141d

Pushing the docs to dev/ for branch: master, commit b97eda5d2ad146a4efdc332a67403908f45a69d1
1 parent: dde16a3

File tree: 1,071 files changed (+4,238 / −3,438 lines)

2 binary files changed (2.1 KB, 2.05 KB) — not shown.

dev/_downloads/plot_missing_values.ipynb

Lines changed: 2 additions & 2 deletions
The notebook change mirrors the script change: inside the JSON, the markdown cell's source string is replaced with the new docstring and the code cell's source string with the new script body. Both appear in full, in readable form, in the dev/_downloads/plot_missing_values.py diff below.

dev/_downloads/plot_missing_values.py

Lines changed: 110 additions & 55 deletions
@@ -1,72 +1,127 @@
 """
-======================================================
+====================================================
 Imputing missing values before building an estimator
-======================================================
-
-This example shows that imputing the missing values can give better
-results than discarding the samples containing any missing value.
-Imputing does not always improve the predictions, so please check via
-cross-validation. Sometimes dropping rows or using marker values is
-more effective.
+====================================================
 
 Missing values can be replaced by the mean, the median or the most frequent
-value using the ``strategy`` hyper-parameter.
+value using the basic ``SimpleImputer``.
 The median is a more robust estimator for data with high magnitude variables
 which could dominate results (otherwise known as a 'long tail').
 
-Script output::
-
-  Score with the entire dataset = 0.56
-  Score without the samples containing missing values = 0.48
-  Score after imputation of the missing values = 0.55
-
-In this case, imputing helps the classifier get close to the original score.
-
+Another option is the MICE imputer. This uses round-robin linear regression,
+treating every variable as an output in turn. The version implemented assumes
+Gaussian (output) variables. If your features are obviously non-Normal,
+consider transforming them to look more Normal so as to improve performance.
 """
+
 import numpy as np
+import matplotlib.pyplot as plt
 
+from sklearn.datasets import load_diabetes
 from sklearn.datasets import load_boston
 from sklearn.ensemble import RandomForestRegressor
 from sklearn.pipeline import Pipeline
-from sklearn.impute import SimpleImputer
+from sklearn.impute import SimpleImputer, MICEImputer
 from sklearn.model_selection import cross_val_score
 
 rng = np.random.RandomState(0)
 
-dataset = load_boston()
-X_full, y_full = dataset.data, dataset.target
-n_samples = X_full.shape[0]
-n_features = X_full.shape[1]
-
-# Estimate the score on the entire dataset, with no missing values
-estimator = RandomForestRegressor(random_state=0, n_estimators=100)
-score = cross_val_score(estimator, X_full, y_full).mean()
-print("Score with the entire dataset = %.2f" % score)
-
-# Add missing values in 75% of the lines
-missing_rate = 0.75
-n_missing_samples = int(np.floor(n_samples * missing_rate))
-missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
-                                      dtype=np.bool),
-                             np.ones(n_missing_samples,
-                                     dtype=np.bool)))
-rng.shuffle(missing_samples)
-missing_features = rng.randint(0, n_features, n_missing_samples)
-
-# Estimate the score without the lines containing missing values
-X_filtered = X_full[~missing_samples, :]
-y_filtered = y_full[~missing_samples]
-estimator = RandomForestRegressor(random_state=0, n_estimators=100)
-score = cross_val_score(estimator, X_filtered, y_filtered).mean()
-print("Score without the samples containing missing values = %.2f" % score)
-
-# Estimate the score after imputation of the missing values
-X_missing = X_full.copy()
-X_missing[np.where(missing_samples)[0], missing_features] = 0
-y_missing = y_full.copy()
-estimator = Pipeline([("imputer", SimpleImputer(missing_values=0,
-                                                strategy="mean")),
-                      ("forest", RandomForestRegressor(random_state=0,
-                                                       n_estimators=100))])
-score = cross_val_score(estimator, X_missing, y_missing).mean()
-print("Score after imputation of the missing values = %.2f" % score)
+
+def get_results(dataset):
+    X_full, y_full = dataset.data, dataset.target
+    n_samples = X_full.shape[0]
+    n_features = X_full.shape[1]
+
+    # Estimate the score on the entire dataset, with no missing values
+    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
+    full_scores = cross_val_score(estimator, X_full, y_full,
+                                  scoring='neg_mean_squared_error')
+
+    # Add missing values in 75% of the lines
+    missing_rate = 0.75
+    n_missing_samples = int(np.floor(n_samples * missing_rate))
+    missing_samples = np.hstack((np.zeros(n_samples - n_missing_samples,
+                                          dtype=np.bool),
+                                 np.ones(n_missing_samples,
+                                         dtype=np.bool)))
+    rng.shuffle(missing_samples)
+    missing_features = rng.randint(0, n_features, n_missing_samples)
+
+    # Estimate the score after replacing missing values by 0
+    X_missing = X_full.copy()
+    X_missing[np.where(missing_samples)[0], missing_features] = 0
+    y_missing = y_full.copy()
+    estimator = RandomForestRegressor(random_state=0, n_estimators=100)
+    zero_impute_scores = cross_val_score(estimator, X_missing, y_missing,
+                                         scoring='neg_mean_squared_error')
+
+    # Estimate the score after imputation (mean strategy) of the missing values
+    X_missing = X_full.copy()
+    X_missing[np.where(missing_samples)[0], missing_features] = 0
+    y_missing = y_full.copy()
+    estimator = Pipeline([("imputer", SimpleImputer(missing_values=0,
+                                                    strategy="mean")),
+                          ("forest", RandomForestRegressor(random_state=0,
+                                                           n_estimators=100))])
+    mean_impute_scores = cross_val_score(estimator, X_missing, y_missing,
+                                         scoring='neg_mean_squared_error')
+
+    # Estimate the score after imputation (MICE strategy) of the missing values
+    estimator = Pipeline([("imputer", MICEImputer(missing_values=0,
+                                                  random_state=0)),
+                          ("forest", RandomForestRegressor(random_state=0,
+                                                           n_estimators=100))])
+    mice_impute_scores = cross_val_score(estimator, X_missing, y_missing,
+                                         scoring='neg_mean_squared_error')
+
+    return ((full_scores.mean(), full_scores.std()),
+            (zero_impute_scores.mean(), zero_impute_scores.std()),
+            (mean_impute_scores.mean(), mean_impute_scores.std()),
+            (mice_impute_scores.mean(), mice_impute_scores.std()))
+
+
+results_diabetes = np.array(get_results(load_diabetes()))
+mses_diabetes = results_diabetes[:, 0] * -1
+stds_diabetes = results_diabetes[:, 1]
+
+results_boston = np.array(get_results(load_boston()))
+mses_boston = results_boston[:, 0] * -1
+stds_boston = results_boston[:, 1]
+
+n_bars = len(mses_diabetes)
+xval = np.arange(n_bars)
+
+x_labels = ['Full data',
+            'Zero imputation',
+            'Mean Imputation',
+            'MICE Imputation']
+colors = ['r', 'g', 'b', 'orange']
+
+# plot diabetes results
+plt.figure(figsize=(12, 6))
+ax1 = plt.subplot(121)
+for j in xval:
+    ax1.barh(j, mses_diabetes[j], xerr=stds_diabetes[j],
+             color=colors[j], alpha=0.6, align='center')
+
+ax1.set_title('Feature Selection Techniques with Diabetes Data')
+ax1.set_xlim(left=np.min(mses_diabetes) * 0.9,
+             right=np.max(mses_diabetes) * 1.1)
+ax1.set_yticks(xval)
+ax1.set_xlabel('MSE')
+ax1.invert_yaxis()
+ax1.set_yticklabels(x_labels)
+
+# plot boston results
+ax2 = plt.subplot(122)
+for j in xval:
+    ax2.barh(j, mses_boston[j], xerr=stds_boston[j],
+             color=colors[j], alpha=0.6, align='center')
+
+ax2.set_title('Feature Selection Techniques with Boston Data')
+ax2.set_yticks(xval)
+ax2.set_xlabel('MSE')
+ax2.invert_yaxis()
+ax2.set_yticklabels([''] * n_bars)
+
+plt.show()
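
For a sense of the two imputers this example now compares, here is a minimal standalone sketch. It assumes this dev snapshot's API, where MICEImputer is importable from sklearn.impute (later scikit-learn releases renamed the class), and it reuses only the constructor arguments that appear in the diff above (missing_values=0, strategy="mean", random_state=0); the toy matrix X_toy is hypothetical, not part of the example.

    import numpy as np
    from sklearn.impute import SimpleImputer, MICEImputer

    # Hypothetical toy data: as in the example above, 0 marks a missing entry.
    X_toy = np.array([[1.0, 2.0, 3.0],
                      [4.0, 0.0, 6.0],
                      [7.0, 8.0, 0.0],
                      [0.0, 5.0, 9.0]])

    # Mean strategy: each 0 is replaced by the column mean of the observed entries.
    print(SimpleImputer(missing_values=0, strategy="mean").fit_transform(X_toy))

    # MICE strategy: round-robin regression of each feature on the others
    # (Gaussian outputs assumed, per the docstring above).
    print(MICEImputer(missing_values=0, random_state=0).fit_transform(X_toy))

As the example does, either imputer can be placed as the first step of a Pipeline, so that cross_val_score fits the imputation inside each training fold rather than on the full dataset.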

dev/_downloads/scikit-learn-docs.pdf

54 KB — Binary file not shown.
(Additional binary files changed, with size deltas between −1.22 KB and +26.4 KB; not shown.)

dev/_sources/auto_examples/applications/plot_face_recognition.rst.txt

Lines changed: 21 additions & 21 deletions
