diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
index 04528e7044ad6..c3fbb25e7d015 100644
--- a/doc/whats_new/v0.20.rst
+++ b/doc/whats_new/v0.20.rst
@@ -652,6 +652,10 @@ Linear, kernelized and related models
   :class:`linear_model.LogisticRegression` when ``verbose`` is set to 0.
   :issue:`10881` by :user:`Alexandre Sevin `.

+- The default value of ``n_estimators`` in :class:`ensemble.RandomForestClassifier` and
+  :class:`ensemble.ExtraTreesClassifier` will change from 10 to 100 in version 0.22. A
+  ``FutureWarning`` is now raised when the default is used. :issue:`11128` by :user:`Olamilekan Wahab `.
+
 Preprocessing

 - Deprecate ``n_values`` and ``categorical_features`` parameters and
diff --git a/poldata.README.2.0 b/poldata.README.2.0
new file mode 100644
index 0000000000000..0f0a19731f6dd
--- /dev/null
+++ b/poldata.README.2.0
@@ -0,0 +1,118 @@
+
+=======
+
+Introduction
+
+This README v2.0 (June, 2004) for the v2.0 polarity dataset comes from
+the URL http://www.cs.cornell.edu/people/pabo/movie-review-data .
+
+=======
+
+What's New -- June, 2004
+
+This dataset represents an enhancement of the review corpus v1.0
+described in README v1.1: it contains more reviews, and labels were
+created with an improved rating-extraction system.
+
+=======
+
+Citation Info
+
+This data was first used in Bo Pang and Lillian Lee,
+``A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization
+Based on Minimum Cuts'', Proceedings of the ACL, 2004.
+
+@InProceedings{Pang+Lee:04a,
+  author = {Bo Pang and Lillian Lee},
+  title = {A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts},
+  booktitle = "Proceedings of the ACL",
+  year = 2004
+}
+
+=======
+
+Data Format Summary
+
+- review_polarity.tar.gz: contains this readme and data used in
+  the experiments described in Pang/Lee ACL 2004.
+
+  Specifically:
+
+  Within the folder "txt_sentoken" are the 2000 processed down-cased
+  text files used in Pang/Lee ACL 2004; the names of the two
+  subdirectories in that folder, "pos" and "neg", indicate the true
+  classification (sentiment) of the component files according to our
+  automatic rating classifier (see section "Rating Decision" below).
+
+  File names consist of a cross-validation tag plus the name of the
+  original html file. The ten folds used in the Pang/Lee ACL 2004 paper's
+  experiments were:
+
+     fold 1: files tagged cv000 through cv099, in numerical order
+     fold 2: files tagged cv100 through cv199, in numerical order
+     ...
+     fold 10: files tagged cv900 through cv999, in numerical order
+
+  Hence, the file neg/cv114_19501.txt, for example, was labeled as
+  negative, served as a member of fold 2, and was extracted from the
+  file 19501.html in polarity_html.zip (see below).
+
+  Each line in each text file corresponds to a single sentence, as
+  determined by Adwait Ratnaparkhi's sentence boundary detector
+  MXTERMINATOR.
+
+  Preliminary steps were taken to remove rating information from the
+  text files, but only the rating information upon which the rating
+  decision was based is guaranteed to have been removed. Thus, if the
+  original review contains several instances of rating information,
+  potentially given in different forms, those not recognized as valid
+  ratings remain part of the review text.
+
+- polarity_html.zip: The original source files from which the
+  processed, labeled, and (randomly) selected data in
+  review_polarity.tar.gz was derived.
+
+  Specifically:
+
+  This data consists of unprocessed, unlabeled html files from the
+  IMDb archive of the rec.arts.movies.reviews newsgroup,
+  http://reviews.imdb.com/Reviews. The files in review_polarity.tar.gz
+  represent a processed subset of these files.
+
+=======
+
+Rating Decision (Appendix A)
+
+This section describes how we determined whether a review was positive
+or negative.
+
+The original html files do not have consistent formats -- a review may
+not have the author's rating with it, and when it does, the rating can
+appear at different places in the file in different forms. We only
+recognize some of the more explicit ratings, which are extracted via a
+set of ad-hoc rules. In essence, a file's classification is determined
+based on the first rating we were able to identify.
+
+
+- In order to obtain more accurate rating decisions, the maximum
+  rating must be specified explicitly, both for numerical ratings
+  and star ratings. ("8/10", "four out of five", and "OUT OF
+  ****: ***" are examples of rating indications we recognize.)
+
+- With a five-star system (or compatible number systems):
+  three-and-a-half stars and up are considered positive,
+  two stars and below are considered negative.
+- With a four-star system (or compatible number system):
+  three stars and up are considered positive,
+  one-and-a-half stars and below are considered negative.
+- With a letter grade system:
+  B or above is considered positive,
+  C- or below is considered negative.
+
+We attempted to recognize half stars, but they are specified in an
+especially free way, which makes them difficult to recognize. Hence,
+we may lose a half star very occasionally; but this only results in 2.5
+stars in a five-star system being categorized as negative, which is
+still reasonable.
+
+
diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py
index b7a349d4b5a89..8e4b045c791c4 100644
--- a/sklearn/ensemble/forest.py
+++ b/sklearn/ensemble/forest.py
@@ -242,6 +242,12 @@ def fit(self, X, y, sample_weight=None):
         -------
         self : object
         """
+        # Deprecation: warn when n_estimators is left at its default (None)
+        if self.n_estimators is None:
+            warnings.warn("'n_estimators' default value will be changed to "
+                          "100 in version 0.22.", FutureWarning)
+            self.n_estimators = 10
+
         # Validate or convert input data
         X = check_array(X, accept_sparse="csc", dtype=DTYPE)
         y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py
index 0054707ba5a0d..40fcaf4f239d5 100644
--- a/sklearn/ensemble/tests/test_forest.py
+++ b/sklearn/ensemble/tests/test_forest.py
@@ -21,7 +21,7 @@

 import pytest

-from sklearn.utils.testing import assert_almost_equal
+from sklearn.utils.testing import assert_almost_equal, assert_no_warnings
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.utils.testing import assert_array_equal
 from sklearn.utils.testing import assert_equal
@@ -204,6 +204,30 @@ def check_probability(name):
                np.exp(clf.predict_log_proba(iris.data)))


+@pytest.mark.parametrize('name', FOREST_CLASSIFIERS)
+def test_no_warning_raised_with_explicit_n_estimators(name):
+    ForestClassifier = FOREST_CLASSIFIERS[name]
+    clf = ForestClassifier(n_estimators=10, random_state=1, max_features=1,
+                           max_depth=1)
+    assert_no_warnings(func=clf.fit, X=iris.data, y=iris.target)
+
+
+@pytest.mark.parametrize('name', FOREST_CLASSIFIERS)
+def test_no_warning_raised_with_future_default_n_estimators(name):
+    ForestClassifier = FOREST_CLASSIFIERS[name]
+    clf = ForestClassifier(n_estimators=100, random_state=1, max_features=1,
+                           max_depth=1)
+    assert_no_warnings(func=clf.fit, X=iris.data, y=iris.target)
+
+
+@pytest.mark.parametrize('name', FOREST_CLASSIFIERS)
+def test_warning_raised_with_n_estimators_as_none(name):
+    ForestClassifier = FOREST_CLASSIFIERS[name]
+    clf = ForestClassifier(n_estimators=None, random_state=1, max_features=1,
+                           max_depth=1)
+    assert_warns(FutureWarning, func=clf.fit, X=iris.data, y=iris.target)
+
+
 @pytest.mark.parametrize('name', FOREST_CLASSIFIERS)
 def test_probability(name):
     check_probability(name)
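
Below is a minimal usage sketch of the behaviour this patch introduces, assuming the classifiers' n_estimators default has been changed to None in __init__ (that part is not shown in the hunks above); relying on the default triggers the new FutureWarning, while passing a value explicitly stays silent:

import warnings

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)

# Leaving n_estimators at its default raises the FutureWarning added in fit(),
# and the estimator falls back to the current default of 10 trees.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    RandomForestClassifier(random_state=0).fit(X, y)
assert any(issubclass(w.category, FutureWarning) for w in caught)

# Passing n_estimators explicitly (whether 10 or 100) does not warn.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)
assert not any(issubclass(w.category, FutureWarning) for w in caught)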
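
As a side note on the poldata README added above, the cut-offs described in "Rating Decision (Appendix A)" can be restated as a small illustrative helper. This is not the authors' ad-hoc rating-extraction system; it only encodes the documented thresholds and assumes a numeric rating and its explicit maximum have already been parsed from a review:

def polarity_from_rating(rating, max_rating):
    """Map an extracted rating to 'pos', 'neg' or None (unlabeled).

    Thresholds follow the poldata README: on a five-star scale,
    >= 3.5 is positive and <= 2 is negative; on a four-star scale,
    >= 3 is positive and <= 1.5 is negative.
    """
    if max_rating == 5:
        return "pos" if rating >= 3.5 else "neg" if rating <= 2 else None
    if max_rating == 4:
        return "pos" if rating >= 3 else "neg" if rating <= 1.5 else None
    return None


# Letter grades per the README: B or above is positive, C- or below is negative.
POSITIVE_GRADES = {"A+", "A", "A-", "B+", "B"}
NEGATIVE_GRADES = {"C-", "D+", "D", "D-", "F"}

assert polarity_from_rating(4, 5) == "pos"    # e.g. "four out of five"
assert polarity_from_rating(1.5, 4) == "neg"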