partev
diff --git a/‎dev/.buildinfo
Lines changed: 1 addition & 1 deletion b/‎dev/.buildinfo
Lines changed: 1 addition & 1 deletion
diff --git a/‎dev/_downloads/05ca8a4e90b4cc2acd69f9e24b4a1f3a/plot_classifier_chain_yeast.ipynb
Lines changed: 81 additions & 2 deletions b/‎dev/_downloads/05ca8a4e90b4cc2acd69f9e24b4a1f3a/plot_classifier_chain_yeast.ipynb
Lines changed: 81 additions & 2 deletions
diff --git a/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
1.65 KB b/‎dev/_downloads/07fcc19ba03226cd3d83d4e40ec44385/auto_examples_python.zip
1.65 KB
diff --git a/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
2.61 KB b/‎dev/_downloads/6f1e7a639e0699d6164445b55e6c116d/auto_examples_jupyter.zip
2.61 KB
diff --git a/‎dev/_downloads/c6856243c97f58098e60fb14d2bf3750/plot_classifier_chain_yeast.py
Lines changed: 87 additions & 47 deletions b/‎dev/_downloads/c6856243c97f58098e60fb14d2bf3750/plot_classifier_chain_yeast.py
Lines changed: 87 additions & 47 deletions
diff --git a/‎dev/_downloads/scikit-learn-docs.zip
13.9 KB b/‎dev/_downloads/scikit-learn-docs.zip
13.9 KB
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
-206 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_001.png
-206 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
-210 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_002.png
-210 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
75 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_003.png
75 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
181 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_004.png
181 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_agglomerative_clustering_thumb.png
-84 Bytes b/‎dev/_images/sphx_glr_plot_agglomerative_clustering_thumb.png
-84 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_anomaly_comparison_001.png
-319 Bytes b/‎dev/_images/sphx_glr_plot_anomaly_comparison_001.png
-319 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_anomaly_comparison_thumb.png
-16 Bytes b/‎dev/_images/sphx_glr_plot_anomaly_comparison_thumb.png
-16 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_caching_nearest_neighbors_001.png
-202 Bytes b/‎dev/_images/sphx_glr_plot_caching_nearest_neighbors_001.png
-202 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_caching_nearest_neighbors_thumb.png
-147 Bytes b/‎dev/_images/sphx_glr_plot_caching_nearest_neighbors_thumb.png
-147 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_classifier_chain_yeast_001.png
-3 Bytes b/‎dev/_images/sphx_glr_plot_classifier_chain_yeast_001.png
-3 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_classifier_chain_yeast_thumb.png
67 Bytes b/‎dev/_images/sphx_glr_plot_classifier_chain_yeast_thumb.png
67 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_cluster_comparison_001.png
453 Bytes b/‎dev/_images/sphx_glr_plot_cluster_comparison_001.png
453 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_cluster_comparison_thumb.png
-21 Bytes b/‎dev/_images/sphx_glr_plot_cluster_comparison_thumb.png
-21 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_coin_segmentation_001.png
61 Bytes b/‎dev/_images/sphx_glr_plot_coin_segmentation_001.png
61 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_coin_segmentation_002.png
52 Bytes b/‎dev/_images/sphx_glr_plot_coin_segmentation_002.png
52 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_coin_segmentation_003.png
-59 Bytes b/‎dev/_images/sphx_glr_plot_coin_segmentation_003.png
-59 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_coin_segmentation_thumb.png
-17 Bytes b/‎dev/_images/sphx_glr_plot_coin_segmentation_thumb.png
-17 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_compare_calibration_001.png
526 Bytes b/‎dev/_images/sphx_glr_plot_compare_calibration_001.png
526 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_compare_calibration_thumb.png
58 Bytes b/‎dev/_images/sphx_glr_plot_compare_calibration_thumb.png
58 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_dict_face_patches_001.png
-16 Bytes b/‎dev/_images/sphx_glr_plot_dict_face_patches_001.png
-16 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_dict_face_patches_thumb.png
1 Byte b/‎dev/_images/sphx_glr_plot_dict_face_patches_thumb.png
1 Byte
diff --git a/‎dev/_images/sphx_glr_plot_digits_pipe_001.png
11 Bytes b/‎dev/_images/sphx_glr_plot_digits_pipe_001.png
11 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_digits_pipe_thumb.png
51 Bytes b/‎dev/_images/sphx_glr_plot_digits_pipe_thumb.png
51 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_document_classification_20newsgroups_005.png
2 Bytes b/‎dev/_images/sphx_glr_plot_document_classification_20newsgroups_005.png
2 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_document_classification_20newsgroups_006.png
-157 Bytes b/‎dev/_images/sphx_glr_plot_document_classification_20newsgroups_006.png
-157 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_document_clustering_001.png
2 Bytes b/‎dev/_images/sphx_glr_plot_document_clustering_001.png
2 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_document_clustering_thumb.png
-8 Bytes b/‎dev/_images/sphx_glr_plot_document_clustering_thumb.png
-8 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_gmm_init_001.png
11 Bytes b/‎dev/_images/sphx_glr_plot_gmm_init_001.png
11 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_gmm_init_thumb.png
-3 Bytes b/‎dev/_images/sphx_glr_plot_gmm_init_thumb.png
-3 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_gradient_boosting_categorical_001.png
1016 Bytes b/‎dev/_images/sphx_glr_plot_gradient_boosting_categorical_001.png
1016 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_gradient_boosting_categorical_002.png
-136 Bytes b/‎dev/_images/sphx_glr_plot_gradient_boosting_categorical_002.png
-136 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_gradient_boosting_categorical_thumb.png
-465 Bytes b/‎dev/_images/sphx_glr_plot_gradient_boosting_categorical_thumb.png
-465 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_gradient_boosting_early_stopping_002.png
-688 Bytes b/‎dev/_images/sphx_glr_plot_gradient_boosting_early_stopping_002.png
-688 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_hashing_vs_dict_vectorizer_001.png
72 Bytes b/‎dev/_images/sphx_glr_plot_hashing_vs_dict_vectorizer_001.png
72 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_hashing_vs_dict_vectorizer_002.png
-5 Bytes b/‎dev/_images/sphx_glr_plot_hashing_vs_dict_vectorizer_002.png
-5 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_hashing_vs_dict_vectorizer_thumb.png
14 Bytes b/‎dev/_images/sphx_glr_plot_hashing_vs_dict_vectorizer_thumb.png
14 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_image_denoising_002.png
23 Bytes b/‎dev/_images/sphx_glr_plot_image_denoising_002.png
23 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_image_denoising_003.png
-110 Bytes b/‎dev/_images/sphx_glr_plot_image_denoising_003.png
-110 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_image_denoising_005.png
114 Bytes b/‎dev/_images/sphx_glr_plot_image_denoising_005.png
114 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_kernel_approximation_001.png
-509 Bytes b/‎dev/_images/sphx_glr_plot_kernel_approximation_001.png
-509 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_kernel_approximation_thumb.png
-314 Bytes b/‎dev/_images/sphx_glr_plot_kernel_approximation_thumb.png
-314 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_kernel_ridge_regression_001.png
-447 Bytes b/‎dev/_images/sphx_glr_plot_kernel_ridge_regression_001.png
-447 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_kernel_ridge_regression_002.png
758 Bytes b/‎dev/_images/sphx_glr_plot_kernel_ridge_regression_002.png
758 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_kernel_ridge_regression_thumb.png
-4 Bytes b/‎dev/_images/sphx_glr_plot_kernel_ridge_regression_thumb.png
-4 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_lasso_model_selection_002.png
62 Bytes b/‎dev/_images/sphx_glr_plot_lasso_model_selection_002.png
62 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_lasso_model_selection_003.png
21 Bytes b/‎dev/_images/sphx_glr_plot_lasso_model_selection_003.png
21 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_learning_curve_002.png
4.68 KB b/‎dev/_images/sphx_glr_plot_learning_curve_002.png
4.68 KB
diff --git a/‎dev/_images/sphx_glr_plot_learning_curve_003.png
1.47 KB b/‎dev/_images/sphx_glr_plot_learning_curve_003.png
1.47 KB
diff --git a/‎dev/_images/sphx_glr_plot_linear_model_coefficient_interpretation_001.png
421 Bytes b/‎dev/_images/sphx_glr_plot_linear_model_coefficient_interpretation_001.png
421 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_linear_model_coefficient_interpretation_thumb.png
-57 Bytes b/‎dev/_images/sphx_glr_plot_linear_model_coefficient_interpretation_thumb.png
-57 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_linkage_comparison_001.png
-6 Bytes b/‎dev/_images/sphx_glr_plot_linkage_comparison_001.png
-6 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_linkage_comparison_thumb.png
-1 Bytes b/‎dev/_images/sphx_glr_plot_linkage_comparison_thumb.png
-1 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_lle_digits_003.png
45 Bytes b/‎dev/_images/sphx_glr_plot_lle_digits_003.png
45 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_lle_digits_005.png
-117 Bytes b/‎dev/_images/sphx_glr_plot_lle_digits_005.png
-117 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_lle_digits_006.png
-185 Bytes b/‎dev/_images/sphx_glr_plot_lle_digits_006.png
-185 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_lle_digits_007.png
135 Bytes b/‎dev/_images/sphx_glr_plot_lle_digits_007.png
135 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_lle_digits_008.png
30 Bytes b/‎dev/_images/sphx_glr_plot_lle_digits_008.png
30 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_lle_digits_009.png
-62 Bytes b/‎dev/_images/sphx_glr_plot_lle_digits_009.png
-62 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_lle_digits_010.png
108 Bytes b/‎dev/_images/sphx_glr_plot_lle_digits_010.png
108 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_lle_digits_011.png
28 Bytes b/‎dev/_images/sphx_glr_plot_lle_digits_011.png
28 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_lle_digits_012.png
-8 Bytes b/‎dev/_images/sphx_glr_plot_lle_digits_012.png
-8 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_lle_digits_013.png
-104 Bytes b/‎dev/_images/sphx_glr_plot_lle_digits_013.png
-104 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_lle_digits_014.png
2 Bytes b/‎dev/_images/sphx_glr_plot_lle_digits_014.png
2 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_manifold_sphere_001.png
24 Bytes b/‎dev/_images/sphx_glr_plot_manifold_sphere_001.png
24 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_manifold_sphere_thumb.png
-19 Bytes b/‎dev/_images/sphx_glr_plot_manifold_sphere_thumb.png
-19 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_model_complexity_influence_001.png
-408 Bytes b/‎dev/_images/sphx_glr_plot_model_complexity_influence_001.png
-408 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_model_complexity_influence_002.png
1.07 KB b/‎dev/_images/sphx_glr_plot_model_complexity_influence_002.png
1.07 KB
diff --git a/‎dev/_images/sphx_glr_plot_model_complexity_influence_003.png
-576 Bytes b/‎dev/_images/sphx_glr_plot_model_complexity_influence_003.png
-576 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_model_complexity_influence_thumb.png
-345 Bytes b/‎dev/_images/sphx_glr_plot_model_complexity_influence_thumb.png
-345 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_multiclass_overview_001.png
-246 Bytes b/‎dev/_images/sphx_glr_plot_multiclass_overview_001.png
-246 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_multiclass_overview_002.png
-160 Bytes b/‎dev/_images/sphx_glr_plot_multiclass_overview_002.png
-160 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_multiclass_overview_thumb.png
-129 Bytes b/‎dev/_images/sphx_glr_plot_multiclass_overview_thumb.png
-129 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_out_of_core_classification_002.png
-271 Bytes b/‎dev/_images/sphx_glr_plot_out_of_core_classification_002.png
-271 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_out_of_core_classification_003.png
-897 Bytes b/‎dev/_images/sphx_glr_plot_out_of_core_classification_003.png
-897 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_out_of_core_classification_004.png
-110 Bytes b/‎dev/_images/sphx_glr_plot_out_of_core_classification_004.png
-110 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_prediction_latency_001.png
-601 Bytes b/‎dev/_images/sphx_glr_plot_prediction_latency_001.png
-601 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_prediction_latency_002.png
177 Bytes b/‎dev/_images/sphx_glr_plot_prediction_latency_002.png
177 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_prediction_latency_003.png
216 Bytes b/‎dev/_images/sphx_glr_plot_prediction_latency_003.png
216 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_prediction_latency_004.png
678 Bytes b/‎dev/_images/sphx_glr_plot_prediction_latency_004.png
678 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_prediction_latency_thumb.png
-26 Bytes b/‎dev/_images/sphx_glr_plot_prediction_latency_thumb.png
-26 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_release_highlights_0_24_0_001.png
1.93 KB b/‎dev/_images/sphx_glr_plot_release_highlights_0_24_0_001.png
1.93 KB
diff --git a/‎dev/_images/sphx_glr_plot_release_highlights_0_24_0_thumb.png
565 Bytes b/‎dev/_images/sphx_glr_plot_release_highlights_0_24_0_thumb.png
565 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_scalable_poly_kernels_001.png
-26 Bytes b/‎dev/_images/sphx_glr_plot_scalable_poly_kernels_001.png
-26 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_scalable_poly_kernels_thumb.png
52 Bytes b/‎dev/_images/sphx_glr_plot_scalable_poly_kernels_thumb.png
52 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_sgd_early_stopping_002.png
-1.14 KB b/‎dev/_images/sphx_glr_plot_sgd_early_stopping_002.png
-1.14 KB
diff --git a/‎dev/_images/sphx_glr_plot_sparse_logistic_regression_20newsgroups_001.png
313 Bytes b/‎dev/_images/sphx_glr_plot_sparse_logistic_regression_20newsgroups_001.png
313 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_sparse_logistic_regression_20newsgroups_thumb.png
61 Bytes b/‎dev/_images/sphx_glr_plot_sparse_logistic_regression_20newsgroups_thumb.png
61 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_stack_predictors_001.png
115 Bytes b/‎dev/_images/sphx_glr_plot_stack_predictors_001.png
115 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_stack_predictors_thumb.png
33 Bytes b/‎dev/_images/sphx_glr_plot_stack_predictors_thumb.png
33 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_successive_halving_heatmap_001.png
66 Bytes b/‎dev/_images/sphx_glr_plot_successive_halving_heatmap_001.png
66 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_successive_halving_heatmap_thumb.png
34 Bytes b/‎dev/_images/sphx_glr_plot_successive_halving_heatmap_thumb.png
34 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_svm_scale_c_001.png
35 Bytes b/‎dev/_images/sphx_glr_plot_svm_scale_c_001.png
35 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_svm_scale_c_thumb.png
-2 Bytes b/‎dev/_images/sphx_glr_plot_svm_scale_c_thumb.png
-2 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_theilsen_001.png
22 Bytes b/‎dev/_images/sphx_glr_plot_theilsen_001.png
22 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_theilsen_002.png
-21 Bytes b/‎dev/_images/sphx_glr_plot_theilsen_002.png
-21 Bytes
diff --git a/‎dev/_images/sphx_glr_plot_theilsen_thumb.png
0 Bytes b/‎dev/_images/sphx_glr_plot_theilsen_thumb.png
0 Bytes
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 2dc85a82fd50b8bc03c0661f70c07bfc
+config: 1a20111001535570e5c0fe3cd42d43d9
 tags: 645f666f9bcd5a90fca523b33c5a78b7
@@ -4,7 +4,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "\n# Classifier Chain\nExample of using classifier chain on a multilabel dataset.\n\nFor this example we will use the [yeast](https://www.openml.org/d/40597) dataset which contains\n2417 datapoints each with 103 features and 14 possible labels. Each\ndata point has at least one label. As a baseline we first train a logistic\nregression classifier for each of the 14 labels. To evaluate the performance of\nthese classifiers we predict on a held-out test set and calculate the\n`jaccard score <jaccard_similarity_score>` for each sample.\n\nNext we create 10 classifier chains. Each classifier chain contains a\nlogistic regression model for each of the 14 labels. The models in each\nchain are ordered randomly. In addition to the 103 features in the dataset,\neach model gets the predictions of the preceding models in the chain as\nfeatures (note that by default at training time each model gets the true\nlabels as features). These additional features allow each chain to exploit\ncorrelations among the classes. The Jaccard similarity score for each chain\ntends to be greater than that of the set independent logistic models.\n\nBecause the models in each chain are arranged randomly there is significant\nvariation in performance among the chains. Presumably there is an optimal\nordering of the classes in a chain that will yield the best performance.\nHowever we do not know that ordering a priori. Instead we can construct an\nvoting ensemble of classifier chains by averaging the binary predictions of\nthe chains and apply a threshold of 0.5. The Jaccard similarity score of the\nensemble is greater than that of the independent models and tends to exceed\nthe score of each chain in the ensemble (although this is not guaranteed\nwith randomly ordered chains).\n"
+        "\n# Multilabel classification using a classifier chain\nThis example shows how to use :class:`~sklearn.multioutput.ClassifierChain` to solve\na multilabel classification problem.\n\nThe most naive strategy to solve such a task is to independently train a binary\nclassifier on each label (i.e. each column of the target variable). At prediction\ntime, the ensemble of binary classifiers is used to assemble a multitask prediction.\n\nThis strategy does not allow modeling the relationships between different tasks. The\n:class:`~sklearn.multioutput.ClassifierChain` is a meta-estimator (i.e. an estimator\ntaking an inner estimator) that implements a more advanced strategy. The ensemble\nof binary classifiers is used as a chain where the prediction of a classifier in the\nchain is used as a feature for training the next classifier on a new label. Therefore,\nthese additional features allow each chain to exploit correlations among labels.\n\nThe `Jaccard similarity <jaccard_similarity_score>` score for a chain tends to be\ngreater than that of the set of independent base models.\n"
       ]
     },
     {
@@ -15,7 +15,86 @@
       },
       "outputs": [],
       "source": [
-        "# Author: Adam Kleczewski\n# License: BSD 3 clause\n\nimport matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import jaccard_score\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.multiclass import OneVsRestClassifier\nfrom sklearn.multioutput import ClassifierChain\n\n# Load a multi-label dataset from https://www.openml.org/d/40597\nX, Y = fetch_openml(\"yeast\", version=4, return_X_y=True, parser=\"pandas\")\nY = Y == \"TRUE\"\nX_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)\n\n# Fit an independent logistic regression model for each class using the\n# OneVsRestClassifier wrapper.\nbase_lr = LogisticRegression()\novr = OneVsRestClassifier(base_lr)\novr.fit(X_train, Y_train)\nY_pred_ovr = ovr.predict(X_test)\novr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average=\"samples\")\n\n# Fit an ensemble of logistic regression classifier chains and take the\n# take the average prediction of all the chains.\nchains = [ClassifierChain(base_lr, order=\"random\", random_state=i) for i in range(10)]\nfor chain in chains:\n    chain.fit(X_train, Y_train)\n\nY_pred_chains = np.array([chain.predict(X_test) for chain in chains])\nchain_jaccard_scores = [\n    jaccard_score(Y_test, Y_pred_chain >= 0.5, average=\"samples\")\n    for Y_pred_chain in Y_pred_chains\n]\n\nY_pred_ensemble = Y_pred_chains.mean(axis=0)\nensemble_jaccard_score = jaccard_score(\n    Y_test, Y_pred_ensemble >= 0.5, average=\"samples\"\n)\n\nmodel_scores = [ovr_jaccard_score] + chain_jaccard_scores\nmodel_scores.append(ensemble_jaccard_score)\n\nmodel_names = (\n    \"Independent\",\n    \"Chain 1\",\n    \"Chain 2\",\n    \"Chain 3\",\n    \"Chain 4\",\n    \"Chain 5\",\n    \"Chain 6\",\n    \"Chain 7\",\n    \"Chain 8\",\n    \"Chain 9\",\n    \"Chain 10\",\n    \"Ensemble\",\n)\n\nx_pos = 
np.arange(len(model_names))\n\n# Plot the Jaccard similarity scores for the independent model, each of the\n# chains, and the ensemble (note that the vertical axis on this plot does\n# not begin at 0).\n\nfig, ax = plt.subplots(figsize=(7, 4))\nax.grid(True)\nax.set_title(\"Classifier Chain Ensemble Performance Comparison\")\nax.set_xticks(x_pos)\nax.set_xticklabels(model_names, rotation=\"vertical\")\nax.set_ylabel(\"Jaccard Similarity Score\")\nax.set_ylim([min(model_scores) * 0.9, max(model_scores) * 1.1])\ncolors = [\"r\"] + [\"b\"] * len(chain_jaccard_scores) + [\"g\"]\nax.bar(x_pos, model_scores, alpha=0.5, color=colors)\nplt.tight_layout()\nplt.show()"
+        "# Author: Adam Kleczewski\n# License: BSD 3 clause"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Loading a dataset\nFor this example, we use the [yeast](https://www.openml.org/d/40597) dataset which contains\n2,417 datapoints each with 103 features and 14 possible labels. Each\ndata point has at least one label. As a baseline we first train a logistic\nregression classifier for each of the 14 labels. To evaluate the performance of\nthese classifiers we predict on a held-out test set and calculate the\nJaccard similarity for each sample.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import matplotlib.pyplot as plt\nimport numpy as np\n\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.model_selection import train_test_split\n\n# Load a multi-label dataset from https://www.openml.org/d/40597\nX, Y = fetch_openml(\"yeast\", version=4, return_X_y=True, parser=\"pandas\")\nY = Y == \"TRUE\"\nX_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Fit models\nWe fit a :class:`~sklearn.linear_model.LogisticRegression` wrapped by\n:class:`~sklearn.multiclass.OneVsRestClassifier` and an ensemble of multiple\n:class:`~sklearn.multioutput.ClassifierChain` instances.\n\n### LogisticRegression wrapped by OneVsRestClassifier\nSince by default :class:`~sklearn.linear_model.LogisticRegression` can't\nhandle data with multiple targets, we need to use\n:class:`~sklearn.multiclass.OneVsRestClassifier`.\nAfter fitting the model, we calculate the Jaccard similarity.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.linear_model import LogisticRegression\nfrom sklearn.metrics import jaccard_score\nfrom sklearn.multiclass import OneVsRestClassifier\n\nbase_lr = LogisticRegression()\novr = OneVsRestClassifier(base_lr)\novr.fit(X_train, Y_train)\nY_pred_ovr = ovr.predict(X_test)\novr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average=\"samples\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Chain of binary classifiers\nBecause the models in each chain are arranged randomly, there is significant\nvariation in performance among the chains. Presumably there is an optimal\nordering of the classes in a chain that will yield the best performance.\nHowever, we do not know that ordering a priori. Instead, we can build a\nvoting ensemble of classifier chains by averaging the predicted probabilities\nof the chains and applying a threshold of 0.5. The Jaccard similarity score of\nthe ensemble is greater than that of the independent models and tends to exceed\nthe score of each chain in the ensemble (although this is not guaranteed\nwith randomly ordered chains).\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "from sklearn.multioutput import ClassifierChain\n\nchains = [ClassifierChain(base_lr, order=\"random\", random_state=i) for i in range(10)]\nfor chain in chains:\n    chain.fit(X_train, Y_train)\n\nY_pred_chains = np.array([chain.predict_proba(X_test) for chain in chains])\nchain_jaccard_scores = [\n    jaccard_score(Y_test, Y_pred_chain >= 0.5, average=\"samples\")\n    for Y_pred_chain in Y_pred_chains\n]\n\nY_pred_ensemble = Y_pred_chains.mean(axis=0)\nensemble_jaccard_score = jaccard_score(\n    Y_test, Y_pred_ensemble >= 0.5, average=\"samples\"\n)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Plot results\nPlot the Jaccard similarity scores for the independent model, each of the\nchains, and the ensemble (note that the vertical axis on this plot does\nnot begin at 0).\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "model_scores = [ovr_jaccard_score] + chain_jaccard_scores + [ensemble_jaccard_score]\n\nmodel_names = (\n    \"Independent\",\n    \"Chain 1\",\n    \"Chain 2\",\n    \"Chain 3\",\n    \"Chain 4\",\n    \"Chain 5\",\n    \"Chain 6\",\n    \"Chain 7\",\n    \"Chain 8\",\n    \"Chain 9\",\n    \"Chain 10\",\n    \"Ensemble\",\n)\n\nx_pos = np.arange(len(model_names))\n\nfig, ax = plt.subplots(figsize=(7, 4))\nax.grid(True)\nax.set_title(\"Classifier Chain Ensemble Performance Comparison\")\nax.set_xticks(x_pos)\nax.set_xticklabels(model_names, rotation=\"vertical\")\nax.set_ylabel(\"Jaccard Similarity Score\")\nax.set_ylim([min(model_scores) * 0.9, max(model_scores) * 1.1])\ncolors = [\"r\"] + [\"b\"] * len(chain_jaccard_scores) + [\"g\"]\nax.bar(x_pos, model_scores, alpha=0.5, color=colors)\nplt.tight_layout()\nplt.show()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "## Results interpretation\nThere are three main takeaways from this plot:\n\n- The independent model wrapped by :class:`~sklearn.multiclass.OneVsRestClassifier`\n  performs worse than the ensemble of classifier chains and some of the individual\n  chains. This is because the logistic regression doesn't model the relationships\n  between the labels.\n- :class:`~sklearn.multioutput.ClassifierChain` takes advantage of correlations\n  among labels, but due to the random nature of the label ordering, it could yield\n  worse results than an independent model.\n- An ensemble of chains performs better because it not only captures the\n  relationships between labels but also does not make strong assumptions about their\n  correct order.\n\n"
       ]
     }
   ],
 
@@ -1,71 +1,94 @@
 """
-============================
-Classifier Chain
-============================
-Example of using classifier chain on a multilabel dataset.
-
-For this example we will use the `yeast
-<https://www.openml.org/d/40597>`_ dataset which contains
-2417 datapoints each with 103 features and 14 possible labels. Each
-data point has at least one label. As a baseline we first train a logistic
-regression classifier for each of the 14 labels. To evaluate the performance of
-these classifiers we predict on a held-out test set and calculate the
-:ref:`jaccard score <jaccard_similarity_score>` for each sample.
-
-Next we create 10 classifier chains. Each classifier chain contains a
-logistic regression model for each of the 14 labels. The models in each
-chain are ordered randomly. In addition to the 103 features in the dataset,
-each model gets the predictions of the preceding models in the chain as
-features (note that by default at training time each model gets the true
-labels as features). These additional features allow each chain to exploit
-correlations among the classes. The Jaccard similarity score for each chain
-tends to be greater than that of the set independent logistic models.
-
-Because the models in each chain are arranged randomly there is significant
-variation in performance among the chains. Presumably there is an optimal
-ordering of the classes in a chain that will yield the best performance.
-However we do not know that ordering a priori. Instead we can construct an
-voting ensemble of classifier chains by averaging the binary predictions of
-the chains and apply a threshold of 0.5. The Jaccard similarity score of the
-ensemble is greater than that of the independent models and tends to exceed
-the score of each chain in the ensemble (although this is not guaranteed
-with randomly ordered chains).
-
+==================================================
+Multilabel classification using a classifier chain
+==================================================
+This example shows how to use :class:`~sklearn.multioutput.ClassifierChain` to solve
+a multilabel classification problem.
+
+The most naive strategy to solve such a task is to independently train a binary
+classifier on each label (i.e. each column of the target variable). At prediction
+time, the ensemble of binary classifiers is used to assemble a multitask prediction.
+
+This strategy does not allow modeling the relationships between different tasks. The
+:class:`~sklearn.multioutput.ClassifierChain` is a meta-estimator (i.e. an estimator
+taking an inner estimator) that implements a more advanced strategy. The ensemble
+of binary classifiers is used as a chain where the prediction of a classifier in the
+chain is used as a feature for training the next classifier on a new label. Therefore,
+these additional features allow each chain to exploit correlations among labels.
+
+The :ref:`Jaccard similarity <jaccard_similarity_score>` score for a chain tends to be
+greater than that of the set of independent base models.
 """
 
 # Author: Adam Kleczewski
 # License: BSD 3 clause
 
+# %%
+# Loading a dataset
+# -----------------
+# For this example, we use the `yeast
+# <https://www.openml.org/d/40597>`_ dataset which contains
+# 2,417 datapoints each with 103 features and 14 possible labels. Each
+# data point has at least one label. As a baseline we first train a logistic
+# regression classifier for each of the 14 labels. To evaluate the performance of
+# these classifiers we predict on a held-out test set and calculate the
+# Jaccard similarity for each sample.
+
 import matplotlib.pyplot as plt
 import numpy as np
 
 from sklearn.datasets import fetch_openml
-from sklearn.linear_model import LogisticRegression
-from sklearn.metrics import jaccard_score
 from sklearn.model_selection import train_test_split
-from sklearn.multiclass import OneVsRestClassifier
-from sklearn.multioutput import ClassifierChain
 
 # Load a multi-label dataset from https://www.openml.org/d/40597
 X, Y = fetch_openml("yeast", version=4, return_X_y=True, parser="pandas")
 Y = Y == "TRUE"
 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
 
-# Fit an independent logistic regression model for each class using the
-# OneVsRestClassifier wrapper.
+# %%
+# Fit models
+# ----------
+# We fit a :class:`~sklearn.linear_model.LogisticRegression` wrapped by
+# :class:`~sklearn.multiclass.OneVsRestClassifier` and an ensemble of multiple
+# :class:`~sklearn.multioutput.ClassifierChain` instances.
+#
+# LogisticRegression wrapped by OneVsRestClassifier
+# **************************************************
+# Since by default :class:`~sklearn.linear_model.LogisticRegression` can't
+# handle data with multiple targets, we need to use
+# :class:`~sklearn.multiclass.OneVsRestClassifier`.
+# After fitting the model, we calculate the Jaccard similarity.
+
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import jaccard_score
+from sklearn.multiclass import OneVsRestClassifier
+
 base_lr = LogisticRegression()
 ovr = OneVsRestClassifier(base_lr)
 ovr.fit(X_train, Y_train)
 Y_pred_ovr = ovr.predict(X_test)
 ovr_jaccard_score = jaccard_score(Y_test, Y_pred_ovr, average="samples")
 
-# Fit an ensemble of logistic regression classifier chains and take the
-# take the average prediction of all the chains.
+# %%
+# Chain of binary classifiers
+# ***************************
+# Because the models in each chain are arranged randomly, there is significant
+# variation in performance among the chains. Presumably there is an optimal
+# ordering of the classes in a chain that will yield the best performance.
+# However, we do not know that ordering a priori. Instead, we can build a
+# voting ensemble of classifier chains by averaging the predicted probabilities
+# of the chains and applying a threshold of 0.5. The Jaccard similarity score of
+# the ensemble is greater than that of the independent models and tends to exceed
+# the score of each chain in the ensemble (although this is not guaranteed
+# with randomly ordered chains).
+
+from sklearn.multioutput import ClassifierChain
+
 chains = [ClassifierChain(base_lr, order="random", random_state=i) for i in range(10)]
 for chain in chains:
     chain.fit(X_train, Y_train)
 
-Y_pred_chains = np.array([chain.predict(X_test) for chain in chains])
+Y_pred_chains = np.array([chain.predict_proba(X_test) for chain in chains])
 chain_jaccard_scores = [
     jaccard_score(Y_test, Y_pred_chain >= 0.5, average="samples")
     for Y_pred_chain in Y_pred_chains
@@ -76,8 +99,14 @@
     Y_test, Y_pred_ensemble >= 0.5, average="samples"
 )
 
-model_scores = [ovr_jaccard_score] + chain_jaccard_scores
-model_scores.append(ensemble_jaccard_score)
+# %%
+# Plot results
+# ------------
+# Plot the Jaccard similarity scores for the independent model, each of the
+# chains, and the ensemble (note that the vertical axis on this plot does
+# not begin at 0).
+
+model_scores = [ovr_jaccard_score] + chain_jaccard_scores + [ensemble_jaccard_score]
 
 model_names = (
     "Independent",
@@ -96,10 +125,6 @@
 
 x_pos = np.arange(len(model_names))
 
-# Plot the Jaccard similarity scores for the independent model, each of the
-# chains, and the ensemble (note that the vertical axis on this plot does
-# not begin at 0).
-
 fig, ax = plt.subplots(figsize=(7, 4))
 ax.grid(True)
 ax.set_title("Classifier Chain Ensemble Performance Comparison")
@@ -111,3 +136,18 @@
 ax.bar(x_pos, model_scores, alpha=0.5, color=colors)
 plt.tight_layout()
 plt.show()
+
+# %%
+# Results interpretation
+# ----------------------
+# There are three main takeaways from this plot:
+#
+# - The independent model wrapped by :class:`~sklearn.multiclass.OneVsRestClassifier`
+#   performs worse than the ensemble of classifier chains and some of the individual
+#   chains. This is because the logistic regression doesn't model the relationships
+#   between the labels.
+# - :class:`~sklearn.multioutput.ClassifierChain` takes advantage of correlations
+#   among labels, but due to the random nature of the label ordering, it could yield
+#   worse results than an independent model.
+# - An ensemble of chains performs better because it not only captures the
+#   relationships between labels but also does not make strong assumptions about their
+#   correct order.