diff --git a/.all-contributorsrc b/.all-contributorsrc new file mode 100644 index 0000000..f553bef --- /dev/null +++ b/.all-contributorsrc @@ -0,0 +1,143 @@ +{ + "projectName": "Deep-Forest", + "projectOwner": "LAMDA-NJU", + "repoType": "github", + "repoHost": "https://github.com", + "files": [ + "CONTRIBUTORS.md" + ], + "imageSize": 100, + "commit": false, + "commitConvention": "none", + "contributorsPerLine": 7, + "contributorsSortAlphabetically": true, + "skipCi": true, + "contributors": [ + { + "login": "xuyxu", + "name": "Yi-Xuan Xu", + "avatar_url": "https://avatars.githubusercontent.com/u/22359569?v=4", + "profile": "https://github.com/xuyxu", + "contributions": [ + "code", + "doc", + "test" + ] + }, + { + "login": "tczhao", + "name": "tczhao", + "avatar_url": "https://avatars.githubusercontent.com/u/20961507?v=4", + "profile": "https://www.linkedin.com/in/tczhao/", + "contributions": [ + "code", + "doc", + "test" + ] + }, + { + "login": "NiMaZi", + "name": "NiMaZi", + "avatar_url": "https://avatars.githubusercontent.com/u/19431549?v=4", + "profile": "https://github.com/NiMaZi", + "contributions": [ + "code", + "test" + ] + }, + { + "login": "pjgao", + "name": "Joey Gao", + "avatar_url": "https://avatars.githubusercontent.com/u/22350313?v=4", + "profile": "https://github.com/pjgao", + "contributions": [ + "code" + ] + }, + { + "login": "dwaipayan05", + "name": "Dwaipayan Munshi", + "avatar_url": "https://avatars.githubusercontent.com/u/53687927?v=4", + "profile": "https://github.com/dwaipayan05", + "contributions": [ + "doc" + ] + }, + { + "login": "Mr-memorandum", + "name": "Mr-memorandum", + "avatar_url": "https://avatars.githubusercontent.com/u/33889145?v=4", + "profile": "https://github.com/Mr-memorandum", + "contributions": [ + "bug" + ] + }, + { + "login": "T-Allen-sudo", + "name": "T-Allen-sudo", + "avatar_url": "https://avatars.githubusercontent.com/u/65913092?v=4", + "profile": "https://github.com/T-Allen-sudo", + "contributions": [ + "maintenance", + "test" + ] + }, + { + "login": "zhenlingcn", + "name": "zhenlingcn", + "avatar_url": "https://avatars.githubusercontent.com/u/18747119?v=4", + "profile": "https://github.com/zhenlingcn", + "contributions": [ + "bug" + ] + }, + { + "login": "Alex-Medium", + "name": "Alex-Medium", + "avatar_url": "https://avatars.githubusercontent.com/u/78067955?v=4", + "profile": "http://alex-medium.github.io", + "contributions": [ + "code", + "test" + ] + }, + { + "login": "chendingyan", + "name": "陈鼎彦", + "avatar_url": "https://avatars.githubusercontent.com/u/16874978?v=4", + "profile": "https://github.com/chendingyan", + "contributions": [ + "bug", + "code" + ] + }, + { + "login": "zzzzwj", + "name": "Wenjie Zhang", + "avatar_url": "https://avatars.githubusercontent.com/u/23235538?v=4", + "profile": "https://github.com/zzzzwj", + "contributions": [ + "code", + "test" + ] + }, + { + "login": "zshgostop", + "name": "zshgostop", + "avatar_url": "https://avatars.githubusercontent.com/u/48615178?v=4", + "profile": "https://github.com/zshgostop", + "contributions": [ + "bug" + ] + }, + { + "login": "IncubatorShokuhou", + "name": "Hao Lyu", + "avatar_url": "https://avatars.githubusercontent.com/u/20434183?v=4", + "profile": "https://github.com/IncubatorShokuhou", + "contributions": [ + "code" + ] + } + ] +} diff --git a/.codecov.yml b/.codecov.yml index 6eeb835..55c238a 100644 --- a/.codecov.yml +++ b/.codecov.yml @@ -14,4 +14,4 @@ comment: false # enable codecov to report to GitHub status checks github_checks: - annotations: false \ No newline
at end of file + annotations: true \ No newline at end of file diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..8f8f553 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,3 @@ +*.pyx linguist-language=Cython +*.pxd linguist-language=Cython +*.rst linguist-language=reStructuredText \ No newline at end of file diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index 0af89b2..9757219 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -7,12 +7,12 @@ on: branches: [ master ] jobs: - build: + build: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest] - python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: [3.7, 3.8, 3.9] steps: - uses: actions/checkout@v2 - name: Set up Python @@ -21,10 +21,12 @@ jobs: python-version: ${{ matrix.python-version }} - name: Display python version run: python -c "import sys; print(sys.version)" + - name: Install libomp for Mac-OS + if: runner.os == 'macOS' + run: brew install libomp - name: Install package dependencies run: | python -m pip install --upgrade pip - pip install -r requirements.txt pip install -r build_tools/requirements.txt - name: Install run: pip install --verbose --editable . diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 3dc9674..14a8ff7 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -12,20 +12,26 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - os: [ubuntu-latest, windows-latest] - python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, windows-latest, macos-latest] steps: - uses: actions/checkout@v2 - - name: Set up Python - uses: actions/setup-python@v2 + + - name: Set up QEMU + if: runner.os == 'Linux' + uses: docker/setup-qemu-action@v1 with: - python-version: ${{ matrix.python-version }} - - name: Install cibuildwheel - run: python -m pip install cibuildwheel==1.8.0 + platforms: all + - name: Build wheels - run: python -m cibuildwheel --output-dir wheelhouse + uses: joerick/cibuildwheel@v1.9.0 + with: + output-dir: wheelhouse env: - CIBW_BUILD: "cp36-manylinux_x86_64 cp37-manylinux_x86_64 cp38-manylinux_x86_64 cp36-win_amd64 cp37-win_amd64 cp38-win_amd64" + CIBW_ARCHS_LINUX: "x86_64 aarch64" + CIBW_ARCHS_WINDOWS: "AMD64" + CIBW_ARCHS_MACOS: "x86_64" + CIBW_BUILD: cp3*-macosx_x86_64 cp3*-win_amd64 cp3*-manylinux_x86_64 cp3*-manylinux_aarch64 + CIBW_SKIP: cp35-* cp36-* - name: Store artifacts uses: actions/upload-artifact@v2 with: diff --git a/.github/workflows/code-quality.yml b/.github/workflows/code-quality.yml index 1ce6e44..3676b41 100644 --- a/.github/workflows/code-quality.yml +++ b/.github/workflows/code-quality.yml @@ -27,5 +27,6 @@ jobs: pip install -r build_tools/requirements.txt - name: Check code quality run: | + black --skip-string-normalization --check --config pyproject.toml ./ chmod +x "${GITHUB_WORKSPACE}/build_tools/linting.sh" ./build_tools/linting.sh diff --git a/.gitignore b/.gitignore index 2f5060e..e41905c 100644 --- a/.gitignore +++ b/.gitignore @@ -11,8 +11,9 @@ *.html *.xml .coverage - .DS_Store +package-lock.json +node_modules/ .idea/ .vscode/ .pytest_cache/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..ffcc612 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,10 @@ +repos: +- repo: https://github.com/ambv/black + rev: 20.8b1 + hooks: + - id: black + language_version: python3 +- repo: 
https://gitlab.com/pycqa/flake8 + rev: 3.8.4 + hooks: + - id: flake8 diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..566f168 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,14 @@ +version: 2 + +formats: all + +sphinx: + configuration: docs/conf.py + +python: + version: 3.7 + install: + - requirements: build_tools/requirements.txt + - method: pip + path: . + - requirements: docs/requirements.txt diff --git a/CHANGELOG.rst b/CHANGELOG.rst index d8fab9f..3af9ee1 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -24,11 +24,36 @@ Version 0.1.* .. role:: raw-latex(raw) :format: latex -.. |MajorFeature| replace:: :raw-html:`Major Feature` :raw-latex:`{\small\sc [Major Feature]}` .. |Feature| replace:: :raw-html:`Feature` :raw-latex:`{\small\sc [Feature]}` .. |Efficiency| replace:: :raw-html:`Efficiency` :raw-latex:`{\small\sc [Efficiency]}` -.. |Enhancement| replace:: :raw-html:`Enhancement` :raw-latex:`{\small\sc [Enhancement]}` +.. |Enhancement| replace:: :raw-html:`Enhancement` :raw-latex:`{\small\sc [Enhancement]}` .. |Fix| replace:: :raw-html:`Fix` :raw-latex:`{\small\sc [Fix]}` .. |API| replace:: :raw-html:`API Change` :raw-latex:`{\small\sc [API Change]}` -- |Feature| configurable predictor parameter `#9 `__ \ No newline at end of file +- |Feature| support the latest version of scikit-learn and drop support on python 3.6 (`#115 `__) @xuyxu +- |Feature| |API| add support on :obj:`pandas.DataFrame` for ``X`` and ``y`` (`#86 `__) @IncubatorShokuhou +- |Fix| fix missing functionality of :meth:`_set_n_trees` @xuyxu +- |Fix| |API| add docstrings for parameter ``bin_type`` (`#74 `__) @xuyxu +- |Feature| |API| recover the parameter ``min_samples_split`` (`#73 `__) @xuyxu +- |Fix| fix the breakdown under the corner case where no internal node exists (`#70 `__) @xuyxu +- |Feature| support python 3.9 (`#69 `__) @xuyxu +- |Fix| fix inconsistency on array shape for :obj:`CascadeForestRegressor` in customized mode (`#67 `__) @xuyxu +- |Fix| fix missing sample indices for parameter ``sample_weight`` in :obj:`KFoldWrapper` (`#48 `__) @xuyxu +- |Feature| |API| add support on customized estimators (`#48 `__) @xuyxu +- |Enhancement| improve target checks for :obj:`CascadeForestRegressor` (`#53 `__) @chendingyan +- |Fix| fix the prediction workflow with only one cascade layer (`#56 `__) @xuyxu +- |Fix| fix inconsistency on predictor name (`#52 `__) @xuyxu +- |Feature| add official support for ManyLinux-aarch64 (`#47 `__) @xuyxu +- |Fix| fix accepted types of target for :obj:`CascadeForestRegressor` (`#44 `__) @xuyxu +- |Feature| |API| add multi-output support for :obj:`CascadeForestRegressor` (`#40 `__) @Alex-Medium +- |Feature| |API| add layer-wise feature importances (`#39 `__) @xuyxu +- |Feature| |API| add scikit-learn backend (`#36 `__) @xuyxu +- |Feature| add official support for Mac-OS (`#34 `__) @T-Allen-sudo +- |Feature| |API| support configurable criterion (`#28 `__) @tczhao +- |Feature| |API| support regression prediction (`#25 `__) @tczhao +- |Fix| fix accepted data types on the :obj:`binner` (`#23 `__) @xuyxu +- |Feature| |API| implement the :meth:`get_estimator` method for efficient indexing (`#22 `__) @xuyxu +- |Feature| support class label encoding (`#18 `__) @NiMaZi +- |Feature| |API| support sample weight in :meth:`fit` (`#7 `__) @tczhao +- |Feature| |API| configurable predictor parameter (`#9 `__) @tczhao +- |Enhancement| add base class ``BaseEstimator`` and ``ClassifierMixin`` (`#8 `__) @pjgao diff --git a/CONTRIBUTIONG.md b/CONTRIBUTIONG.md new file 
mode 100644 index 0000000..56d33f2 --- /dev/null +++ b/CONTRIBUTIONG.md @@ -0,0 +1,7 @@ +## Install requirements + +`python -m pip install --upgrade pip` +`pip install -r build_tools/requirements.txt` +`pre-commit install` + +## Add your change to CHANGELOG.rst \ No newline at end of file diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md new file mode 100644 index 0000000..d78a316 --- /dev/null +++ b/CONTRIBUTORS.md @@ -0,0 +1,34 @@ + +## Contributors ✨ + +Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): + + + + + + + Alex-Medium💻 ⚠️ + Dwaipayan Munshi📖 + Hao Lyu💻 + Joey Gao💻 + Mr-memorandum🐛 + NiMaZi💻 ⚠️ + T-Allen-sudo🚧 ⚠️ + Wenjie Zhang💻 ⚠️ + + + Yi-Xuan Xu💻 📖 ⚠️ + tczhao💻 📖 ⚠️ + zhenlingcn🐛 + zshgostop🐛 + 陈鼎彦🐛 💻 + + + + + + + + +This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome! \ No newline at end of file diff --git a/LICENSE b/LICENSE index 4997f31..afb43bd 100644 --- a/LICENSE +++ b/LICENSE @@ -1,8 +1,8 @@ COPYRIGHT ========= -Copyright (c) 2021 LAMDA (http://www.lamda.nju.edu.cn), Nanjing University, China -All rights reserved. +Copyright (c) 2021-2023 LAMDA (http://www.lamda.nju.edu.cn), Nanjing +University, China All rights reserved. LICENSE ======= diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..90b37fa --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,7 @@ +include *.rst +recursive-include docs * +recursive-exclude tests * +include *.md +recursive-include deepforest *.py +recursive-include deepforest *.c *.h *.pyx *.pxd *.pxi +include LICENSE \ No newline at end of file diff --git a/README.rst b/README.rst index 7ff6912..bb3db5f 100644 --- a/README.rst +++ b/README.rst @@ -1,11 +1,14 @@ Deep Forest (DF) 21 =================== -|github|_ |codecov|_ |python|_ |pypi|_ +|github|_ |readthedocs|_ |codecov|_ |python|_ |pypi|_ |style|_ .. |github| image:: https://github.com/LAMDA-NJU/Deep-Forest/workflows/DeepForest-CI/badge.svg .. _github: https://github.com/LAMDA-NJU/Deep-Forest/actions +.. |readthedocs| image:: https://readthedocs.org/projects/deep-forest/badge/?version=latest +.. _readthedocs: https://deep-forest.readthedocs.io + .. |codecov| image:: https://codecov.io/gh/LAMDA-NJU/Deep-Forest/branch/master/graph/badge.svg?token=5BVXOT8RPO .. _codecov: https://codecov.io/gh/LAMDA-NJU/Deep-Forest @@ -15,21 +18,26 @@ Deep Forest (DF) 21 .. |pypi| image:: https://img.shields.io/pypi/v/deep-forest?color=blue .. _pypi: https://pypi.org/project/deep-forest/ -**DF21** is an implementation of Deep Forest 2021.2.1. It is designed to have the following advantages: +.. |style| image:: https://img.shields.io/badge/code%20style-black-000000.svg +.. _style: https://github.com/psf/black + +**DF21** is an implementation of `Deep Forest `__ 2021.2.1. It is designed to have the following advantages: - **Powerful**: Better accuracy than existing tree-based ensemble methods. - **Easy to Use**: Less efforts on tunning parameters. - **Efficient**: Fast training speed and high efficiency. - **Scalable**: Capable of handling large-scale data. -Whenever one used tree-based machine learning approaches such as Random Forest or GBDT, DF21 may offer a new powerful option. +DF21 offers an effective & powerful option to the tree-based machine learning algorithms such as Random Forest or GBDT. -For a quick start, please refer to `How to Get Started `__.
For a detailed guidance on parameter tunning, please refer to `Parameters Tunning `__. +For a quick start, please refer to `How to Get Started `__. For a detailed guidance on parameter tunning, please refer to `Parameters Tunning `__. + +DF21 is optimized for what a tree-based ensemble excels at (i.e., tabular data), if you want to use the multi-grained scanning part to better handle structured data like images, please refer to the `origin implementation `__ for details. Installation ------------ -The package is available via PyPI using: +DF21 can be installed using pip via `PyPI `__ which is the package installer for Python. You can use pip to install packages from the Python Package Index and other indexes. Refer `this `__ for the documentation of pip. Use this command to download DF21 : .. code-block:: bash @@ -38,6 +46,9 @@ The package is available via PyPI using: Quickstart ---------- +Classification +************** + .. code-block:: python from sklearn.datasets import load_digits @@ -55,11 +66,31 @@ Quickstart print("\nTesting Accuracy: {:.3f} %".format(acc)) >>> Testing Accuracy: 98.667 % +Regression +********** + +.. code-block:: python + + from sklearn.datasets import load_boston + from sklearn.model_selection import train_test_split + from sklearn.metrics import mean_squared_error + + from deepforest import CascadeForestRegressor + + X, y = load_boston(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) + model = CascadeForestRegressor(random_state=1) + model.fit(X_train, y_train) + y_pred = model.predict(X_test) + mse = mean_squared_error(y_test, y_pred) + print("\nTesting MSE: {:.3f}".format(mse)) + >>> Testing MSE: 8.068 + Resources --------- -* `Documentation `__ -* Deep Forest: `[Paper] `__ +* `Documentation `__ +* Deep Forest: `[Conference] `__ | `[Journal] `__ * Keynote at AISTATS 2019: `[Slides] `__ Reference @@ -77,13 +108,16 @@ Reference year={2019}} @inproceedings{zhou2017deep, - Author = {Zhi-Hua Zhou and Ji Feng}, - Booktitle = {IJCAI}, - Pages = {3553-3559}, - Title = {{Deep Forest:} Towards an alternative to deep neural networks}, - Year = {2017}} + title = {{Deep Forest:} Towards an alternative to deep neural networks}, + author = {Zhi-Hua Zhou and Ji Feng}, + booktitle = {IJCAI}, + pages = {3553--3559}, + year = {2017}} + +Thanks to all our contributors +------------------------------ -Acknowledgement ---------------- +|contributors| -The lead developer and maintainer of DF21 is Mr. `Yi-Xuan Xu `__. Before the release, it has been used internally in the LAMDA Group, Nanjing University, China. +.. |contributors| image:: https://contributors-img.web.app/image?repo=LAMDA-NJU/Deep-Forest +.. 
_contributors: https://github.com/LAMDA-NJU/Deep-Forest/graphs/contributors diff --git a/build_tools/requirements.txt b/build_tools/requirements.txt index 0d6c909..6fb9f1a 100644 --- a/build_tools/requirements.txt +++ b/build_tools/requirements.txt @@ -1,5 +1,10 @@ -flake8 +-r ../requirements.txt +pytest +pre-commit +black==20.8b1 +click==8.0.3 +flake8==3.8.4 pytest-cov lightgbm xgboost -cython>=0.28.5 \ No newline at end of file +cython>=0.28.5 diff --git a/deepforest/__init__.py b/deepforest/__init__.py index ebbdabc..dcc150c 100644 --- a/deepforest/__init__.py +++ b/deepforest/__init__.py @@ -1,12 +1,19 @@ -from .cascade import CascadeForestClassifier -from .forest import RandomForestClassifier -from .forest import ExtraTreesClassifier -from .tree import DecisionTreeClassifier -from .tree import ExtraTreeClassifier +from .cascade import CascadeForestClassifier, CascadeForestRegressor +from .forest import RandomForestClassifier, RandomForestRegressor +from .forest import ExtraTreesClassifier, ExtraTreesRegressor +from .tree import DecisionTreeClassifier, DecisionTreeRegressor +from .tree import ExtraTreeClassifier, ExtraTreeRegressor -__all__ = ["CascadeForestClassifier", - "RandomForestClassifier", - "ExtraTreesClassifier", - "DecisionTreeClassifier", - "ExtraTreeClassifier"] +__all__ = [ + "CascadeForestClassifier", + "CascadeForestRegressor", + "RandomForestClassifier", + "RandomForestRegressor", + "ExtraTreesClassifier", + "ExtraTreesRegressor", + "DecisionTreeClassifier", + "DecisionTreeRegressor", + "ExtraTreeClassifier", + "ExtraTreeRegressor", +] diff --git a/deepforest/_binner.py b/deepforest/_binner.py index 220c448..35fd797 100644 --- a/deepforest/_binner.py +++ b/deepforest/_binner.py @@ -9,8 +9,8 @@ __all__ = ["Binner"] import numpy as np -from sklearn.utils import check_random_state from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils import check_random_state, check_array from . import _cutils as _LIB @@ -21,13 +21,14 @@ def _find_binning_thresholds_per_feature( - col_data, n_bins, bin_type="percentile" + col_data, n_bins, bin_type="percentile" ): """ Private function used to find midpoints for samples along a specific feature. """ if len(col_data.shape) != 1: + msg = ( "Per-feature data should be of the shape (n_samples,), but" " got {}-dims instead." @@ -72,17 +73,13 @@ def _find_binning_thresholds( rng = check_random_state(random_state) if n_samples > bin_subsample: - subset = rng.choice( - np.arange(n_samples), bin_subsample, replace=False - ) + subset = rng.choice(np.arange(n_samples), bin_subsample, replace=False) X = X.take(subset, axis=0) binning_thresholds = [] for f_idx in range(n_features): threshold = _find_binning_thresholds_per_feature( - X[:, f_idx], - n_bins, - bin_type + X[:, f_idx], n_bins, bin_type ) binning_thresholds.append(threshold) @@ -90,13 +87,12 @@ def _find_binning_thresholds( class Binner(TransformerMixin, BaseEstimator): - def __init__( self, n_bins=255, bin_subsample=2e5, bin_type="percentile", - random_state=None + random_state=None, ): self.n_bins = n_bins + 1 # + 1 for missing values self.bin_subsample = int(bin_subsample) @@ -107,8 +103,10 @@ def __init__( def _validate_params(self): if not 2 <= self.n_bins - 1 <= 255: - msg = ("`n_bins` should be in the range [2, 255], bug got" - " {} instead.") + msg = ( + "`n_bins` should be in the range [2, 255], bug got" + " {} instead." 
+ ) raise ValueError(msg.format(self.n_bins - 1)) if not self.bin_subsample > 0: @@ -119,8 +117,10 @@ def _validate_params(self): raise ValueError(msg.format(self.bin_subsample)) if self.bin_type not in ("percentile", "interval"): - msg = ("The type of binner should be one of {{percentile, interval" - "}}, bug got {} instead.") + msg = ( + "The type of binner should be one of {{percentile, interval" + "}}, bug got {} instead." + ) raise ValueError(msg.format(self.bin_type)) def fit(self, X): @@ -162,7 +162,9 @@ def transform(self, X): msg.format(self.n_bins_non_missing_.shape[0], X.shape[1]) ) + X = check_array(X, dtype=X_DTYPE, force_all_finite=False) X_binned = np.zeros_like(X, dtype=X_BINNED_DTYPE, order="F") + _LIB._map_to_bins( X, self.bin_thresholds_, self.missing_values_bin_idx_, X_binned ) diff --git a/deepforest/_cutils.pyx b/deepforest/_cutils.pyx index 7cc0b12..398e437 100644 --- a/deepforest/_cutils.pyx +++ b/deepforest/_cutils.pyx @@ -74,11 +74,11 @@ cpdef _map_to_bins(object X, """ cdef: const X_DTYPE_C[:, :] X_ndarray = X - SIZE_t n_features = X_ndarray.shape[1] + SIZE_t n_features = X.shape[1] SIZE_t feature_idx for feature_idx in range(n_features): - _map_num_col_to_bins(X[:, feature_idx], + _map_num_col_to_bins(X_ndarray[:, feature_idx], binning_thresholds[feature_idx], missing_values_bin_idx, binned[:, feature_idx]) diff --git a/deepforest/_estimator.py b/deepforest/_estimator.py index 66d275d..c8ce2dc 100644 --- a/deepforest/_estimator.py +++ b/deepforest/_estimator.py @@ -3,35 +3,80 @@ __all__ = ["Estimator"] -from .forest import RandomForestClassifier, ExtraTreesClassifier +import numpy as np +from .forest import ( + RandomForestClassifier, + ExtraTreesClassifier, + RandomForestRegressor, + ExtraTreesRegressor, +) +from sklearn.ensemble import ( + RandomForestClassifier as sklearn_RandomForestClassifier, + ExtraTreesClassifier as sklearn_ExtraTreesClassifier, + RandomForestRegressor as sklearn_RandomForestRegressor, + ExtraTreesRegressor as sklearn_ExtraTreesRegressor, +) -def make_estimator( +def make_classifier_estimator( name, + criterion, n_trees=100, max_depth=None, + min_samples_split=2, min_samples_leaf=1, + backend="custom", n_jobs=None, - random_state=None + random_state=None, ): # RandomForestClassifier if name == "rf": - estimator = RandomForestClassifier( - n_estimators=n_trees, - max_depth=max_depth, - min_samples_leaf=min_samples_leaf, - n_jobs=n_jobs, - random_state=random_state, - ) + if backend == "custom": + estimator = RandomForestClassifier( + criterion=criterion, + n_estimators=n_trees, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + n_jobs=n_jobs, + random_state=random_state, + ) + elif backend == "sklearn": + estimator = sklearn_RandomForestClassifier( + criterion=criterion, + n_estimators=n_trees, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + bootstrap=True, + oob_score=True, + n_jobs=n_jobs, + random_state=random_state, + ) # ExtraTreesClassifier elif name == "erf": - estimator = ExtraTreesClassifier( - n_estimators=n_trees, - max_depth=max_depth, - min_samples_leaf=min_samples_leaf, - n_jobs=n_jobs, - random_state=random_state - ) + if backend == "custom": + estimator = ExtraTreesClassifier( + criterion=criterion, + n_estimators=n_trees, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + n_jobs=n_jobs, + random_state=random_state, + ) + elif backend == "sklearn": + estimator = 
sklearn_ExtraTreesClassifier( + criterion=criterion, + n_estimators=n_trees, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + bootstrap=True, + oob_score=True, + n_jobs=n_jobs, + random_state=random_state, + ) else: msg = "Unknown type of estimator, which should be one of {{rf, erf}}." raise NotImplementedError(msg) @@ -39,37 +84,142 @@ def make_estimator( return estimator -class Estimator(object): +def make_regressor_estimator( + name, + criterion, + n_trees=100, + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + backend="custom", + n_jobs=None, + random_state=None, +): + # RandomForestRegressor + if name == "rf": + if backend == "custom": + estimator = RandomForestRegressor( + criterion=criterion, + n_estimators=n_trees, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + n_jobs=n_jobs, + random_state=random_state, + ) + elif backend == "sklearn": + estimator = sklearn_RandomForestRegressor( + criterion=criterion, + n_estimators=n_trees, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + bootstrap=True, + oob_score=True, + n_jobs=n_jobs, + random_state=random_state, + ) + # ExtraTreesRegressor + elif name == "erf": + if backend == "custom": + estimator = ExtraTreesRegressor( + criterion=criterion, + n_estimators=n_trees, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + n_jobs=n_jobs, + random_state=random_state, + ) + elif backend == "sklearn": + estimator = sklearn_ExtraTreesRegressor( + criterion=criterion, + n_estimators=n_trees, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + bootstrap=True, + oob_score=True, + n_jobs=n_jobs, + random_state=random_state, + ) + else: + msg = "Unknown type of estimator, which should be one of {{rf, erf}}." 
+ raise NotImplementedError(msg) + + return estimator + +class Estimator(object): def __init__( self, name, + criterion, n_trees=100, max_depth=None, + min_samples_split=2, min_samples_leaf=1, + backend="custom", n_jobs=None, - random_state=None + random_state=None, + is_classifier=True, ): - self.estimator_ = make_estimator(name, - n_trees, - max_depth, - min_samples_leaf, - n_jobs, - random_state) + + self.backend = backend + self.is_classifier = is_classifier + if self.is_classifier: + self.estimator_ = make_classifier_estimator( + name, + criterion, + n_trees, + max_depth, + min_samples_split, + min_samples_leaf, + backend, + n_jobs, + random_state, + ) + else: + self.estimator_ = make_regressor_estimator( + name, + criterion, + n_trees, + max_depth, + min_samples_split, + min_samples_leaf, + backend, + n_jobs, + random_state, + ) @property def oob_decision_function_(self): + # Scikit-Learn uses `oob_prediction_` for ForestRegressor + if self.backend == "sklearn" and not self.is_classifier: + oob_prediction = self.estimator_.oob_prediction_ + if len(oob_prediction.shape) == 1: + oob_prediction = np.expand_dims(oob_prediction, 1) + return oob_prediction return self.estimator_.oob_decision_function_ - def fit_transform(self, X, y): - self.estimator_.fit(X, y) - X_aug = self.estimator_.oob_decision_function_ + @property + def feature_importances_(self): + """Return the impurity-based feature importances from the estimator.""" - return X_aug + return self.estimator_.feature_importances_ - def transform(self, X): + def fit_transform(self, X, y, sample_weight=None): + self.estimator_.fit(X, y, sample_weight) + return self.oob_decision_function_ - return self.estimator_.predict_proba(X) + def transform(self, X): + """Preserved for the naming consistency.""" + return self.predict(X) def predict(self, X): - return self.estimator_.predict_proba(X) + if self.is_classifier: + return self.estimator_.predict_proba(X) + pred = self.estimator_.predict(X) + if len(pred.shape) == 1: + pred = np.expand_dims(pred, 1) + return pred diff --git a/deepforest/_forest.pyx b/deepforest/_forest.pyx index 335f824..a21938b 100644 --- a/deepforest/_forest.pyx +++ b/deepforest/_forest.pyx @@ -77,6 +77,7 @@ cdef void _apply_region(const DTYPE_t [:, :] data, """ cdef: SIZE_t n_samples = data.shape[0] + SIZE_t n_internal_nodes = feature.shape[0] SIZE_t i SIZE_t node_id SIZE_t node_feature @@ -87,6 +88,11 @@ cdef void _apply_region(const DTYPE_t [:, :] data, with nogil: for i in range(n_samples): + # Skip the corner case where the root node is a leaf node + if n_internal_nodes == 0: + out[i] = 0 + continue + node_id = 0 node_feature = feature[node_id] node_threshold = threshold[node_id] diff --git a/deepforest/_io.py b/deepforest/_io.py index 041d71a..c840f43 100644 --- a/deepforest/_io.py +++ b/deepforest/_io.py @@ -10,7 +10,7 @@ class is designed to support the partial mode in deep forest. import shutil import warnings import tempfile -from joblib import (load, dump) +from joblib import load, dump class Buffer(object): @@ -33,12 +33,15 @@ class Buffer(object): store_data : bool, default=False Whether to cache the intermediate data to the local buffer. 
""" - def __init__(self, - use_buffer, - buffer_dir=None, - store_est=True, - store_pred=True, - store_data=False): + + def __init__( + self, + use_buffer, + buffer_dir=None, + store_est=True, + store_pred=True, + store_data=False, + ): self.use_buffer = use_buffer self.store_est = store_est and use_buffer @@ -48,16 +51,19 @@ def __init__(self, # Create buffer if self.use_buffer: - self.buffer = tempfile.TemporaryDirectory(prefix="buffer_", - dir=self.buffer_dir) + self.buffer = tempfile.TemporaryDirectory( + prefix="buffer_", dir=self.buffer_dir + ) if store_data: - self.data_dir_ = tempfile.mkdtemp(prefix="data_", - dir=self.buffer.name) + self.data_dir_ = tempfile.mkdtemp( + prefix="data_", dir=self.buffer.name + ) if store_est or store_pred: - self.model_dir_ = tempfile.mkdtemp(prefix="model_", - dir=self.buffer.name) + self.model_dir_ = tempfile.mkdtemp( + prefix="model_", dir=self.buffer.name + ) self.pred_dir_ = os.path.join(self.model_dir_, "predictor.est") @property @@ -97,14 +103,16 @@ def cache_data(self, layer_idx, X, is_training_data=True): return X if is_training_data: - cache_dir = os.path.join(self.data_dir_, - "joblib_train_{}.mmap".format(layer_idx)) + cache_dir = os.path.join( + self.data_dir_, "joblib_train_{}.mmap".format(layer_idx) + ) # Delete if os.path.exists(cache_dir): os.unlink(cache_dir) else: - cache_dir = os.path.join(self.data_dir_, - "joblib_test_{}.mmap".format(layer_idx)) + cache_dir = os.path.join( + self.data_dir_, "joblib_test_{}.mmap".format(layer_idx) + ) # Delete if os.path.exists(cache_dir): os.unlink(cache_dir) @@ -132,7 +140,7 @@ def cache_estimator(self, layer_idx, est_idx, est_name, est): cached. est_idx : int The index of the estimator in the cascade layer to be cached. - est_name : {"rf", "erf"} + est_name : {"rf", "erf", "custom"} The name of the estimator to be cached. est : object The object of base estimator. @@ -209,8 +217,10 @@ def del_estimator(self, layer_idx): try: os.unlink(os.path.join(self.model_dir_, est_name)) except OSError: - msg = ("Permission denied when deleting the dumped" - " estimators during the early stopping stage.") + msg = ( + "Permission denied when deleting the dumped" + " estimators during the early stopping stage." + ) warnings.warn(msg, RuntimeWarning) def close(self): @@ -225,7 +235,7 @@ def close(self): def model_mkdir(dirname): """Make the directory for saving the model.""" if os.path.isdir(dirname): - msg = ("The directory to be created already exists {}.") + msg = "The directory to be created already exists {}." raise RuntimeError(msg.format(dirname)) os.mkdir(dirname) @@ -293,7 +303,11 @@ def model_loadobj(dirname, obj_type, d=None): obj = load(os.path.join(dirname, "{}.pkl".format(obj_type))) return obj elif obj_type == "layer": - from ._layer import Layer # avoid circular import + from ._layer import ( + ClassificationCascadeLayer, + RegressionCascadeLayer, + CustomCascadeLayer, + ) if not isinstance(d, dict): msg = "Loading layers requires the dict from `param.pkl`." 
@@ -305,33 +319,70 @@ def model_loadobj(dirname, obj_type, d=None): for layer_idx in range(n_layers): - # Build a temporary layer - layer_ = Layer( - layer_idx=layer_idx, - n_classes=d["n_outputs"], - n_estimators=d["n_estimators"], - partial_mode=d["partial_mode"], - buffer=d["buffer"], - verbose=d["verbose"] - ) - - for est_type in ("rf", "erf"): - for est_idx in range(n_estimators): - est_key = "{}-{}-{}".format( - layer_idx, est_idx, est_type + if not d["use_custom_estimator"]: + if d["is_classifier"]: + layer_ = ClassificationCascadeLayer( + layer_idx=layer_idx, + n_outputs=d["n_outputs"], + criterion=d["criterion"], + n_estimators=d["n_estimators"], + partial_mode=d["partial_mode"], + buffer=d["buffer"], + verbose=d["verbose"], ) - dest = os.path.join( - dirname, "estimator", est_key + ".est" + else: + layer_ = RegressionCascadeLayer( + layer_idx=layer_idx, + n_outputs=d["n_outputs"], + criterion=d["criterion"], + n_estimators=d["n_estimators"], + partial_mode=d["partial_mode"], + buffer=d["buffer"], + verbose=d["verbose"], ) + for est_type in ("rf", "erf"): + for est_idx in range(n_estimators): + est_key = "{}-{}-{}".format( + layer_idx, est_idx, est_type + ) + dest = os.path.join( + dirname, "estimator", est_key + ".est" + ) + + if not os.path.isfile(dest): + msg = "Missing estimator in the path: {}." + raise RuntimeError(msg.format(dest)) + + if d["partial_mode"]: + layer_.estimators_.update( + {est_key: os.path.abspath(dest)} + ) + else: + est = load(dest) + layer_.estimators_.update({est_key: est}) + else: + + layer_ = CustomCascadeLayer( + layer_idx=layer_idx, + n_splits=1, # will not be used + n_outputs=d["n_outputs"], + estimators=[None] * n_estimators, # will not be used + partial_mode=d["partial_mode"], + buffer=d["buffer"], + verbose=d["verbose"], + ) + + for est_idx in range(n_estimators): + est_key = "{}-{}-custom".format(layer_idx, est_idx) + dest = os.path.join(dirname, "estimator", est_key + ".est") + if not os.path.isfile(dest): msg = "Missing estimator in the path: {}." raise RuntimeError(msg.format(dest)) if d["partial_mode"]: - layer_.estimators_.update( - {est_key: os.path.abspath(dest)} - ) + layer_.estimators_.update({est_key: dest}) else: est = load(dest) layer_.estimators_.update({est_key: est}) @@ -348,13 +399,13 @@ def model_loadobj(dirname, obj_type, d=None): pred_path = os.path.join(dirname, "estimator", "predictor.est") if not os.path.isfile(pred_path): - msg = "Missing classifier in the path: {}." + msg = "Missing predictor in the path: {}." raise RuntimeError(msg.format(pred_path)) if d["partial_mode"]: return os.path.abspath(pred_path) else: - clf = load(pred_path) - return clf + predictor = load(pred_path) + return predictor else: raise ValueError("Unknown object type: {}.".format(obj_type)) diff --git a/deepforest/_layer.py b/deepforest/_layer.py index e94e00e..e0015fc 100644 --- a/deepforest/_layer.py +++ b/deepforest/_layer.py @@ -1,13 +1,21 @@ -"""Implementation of the forest-based cascade layer.""" +"""Implementation of the cascade layer in deep forest.""" -__all__ = ["Layer"] +__all__ = [ + "BaseCascadeLayer", + "ClassificationCascadeLayer", + "RegressionCascadeLayer", + "CustomCascadeLayer", +] import numpy as np -from sklearn.metrics import accuracy_score +from sklearn.base import is_classifier +from sklearn.metrics import accuracy_score, mean_squared_error +from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin from . 
import _utils from ._estimator import Estimator +from .utils.kfoldwrapper import KFoldWrapper def _build_estimator( @@ -20,7 +28,8 @@ def _build_estimator( oob_decision_function, partial_mode=True, buffer=None, - verbose=1 + verbose=1, + sample_weight=None, ): """Private function used to fit a single estimator.""" if verbose > 1: @@ -28,7 +37,7 @@ def _build_estimator( key = estimator_name + "_" + str(estimator_idx) print(msg.format(_utils.ctime(), key, layer_idx)) - X_aug_train = estimator.fit_transform(X, y) + X_aug_train = estimator.fit_transform(X, y, sample_weight) oob_decision_function += estimator.oob_decision_function_ if partial_mode: @@ -41,16 +50,18 @@ def _build_estimator( return X_aug_train, estimator -class Layer(object): - +class BaseCascadeLayer(BaseEstimator): def __init__( self, layer_idx, - n_classes, + n_outputs, + criterion, n_estimators=2, n_trees=100, max_depth=None, + min_samples_split=2, min_samples_leaf=1, + backend="custom", partial_mode=False, buffer=None, n_jobs=None, @@ -58,17 +69,19 @@ def __init__( verbose=1, ): self.layer_idx = layer_idx - self.n_classes = n_classes + self.n_outputs = n_outputs + self.criterion = criterion self.n_estimators = n_estimators * 2 # internal conversion self.n_trees = n_trees self.max_depth = max_depth + self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf + self.backend = backend self.partial_mode = partial_mode self.buffer = buffer self.n_jobs = n_jobs self.random_state = random_state self.verbose = verbose - # Internal container self.estimators_ = {} @@ -76,23 +89,41 @@ def __init__( def n_trees_(self): return self.n_estimators * self.n_trees + @property + def feature_importances_(self): + feature_importances_ = np.zeros((self.n_features,)) + for idx, (key, estimator) in enumerate(self.estimators_.items()): + # Partial mode + if isinstance(estimator, str): + estimator_ = self.buffer.load_estimator(estimator) + feature_importances_ += estimator_.feature_importances_ + # In-memory mode + else: + feature_importances_ += estimator.feature_importances_ + + return feature_importances_ / len(self.estimators_) + def _make_estimator(self, estimator_idx, estimator_name): """Make and configure a copy of the estimator.""" # Set the non-overlapped random state if self.random_state is not None: - random_state = (self.random_state + - 10 * estimator_idx + - 100 * self.layer_idx) + random_state = ( + self.random_state + 10 * estimator_idx + 100 * self.layer_idx + ) else: random_state = None estimator = Estimator( name=estimator_name, + criterion=self.criterion, n_trees=self.n_trees, max_depth=self.max_depth, + min_samples_split=self.min_samples_split, min_samples_leaf=self.min_samples_leaf, + backend=self.backend, n_jobs=self.n_jobs, - random_state=random_state + random_state=random_state, + is_classifier=is_classifier(self), ) return estimator @@ -107,13 +138,73 @@ def _validate_params(self): msg = "`n_trees` = {} should be strictly positive." 
raise ValueError(msg.format(self.n_trees)) - def fit_transform(self, X, y): + def transform(self, X): + """Preserved for the naming consistency.""" + return self.predict_full(X) - self._validate_params() + def predict_full(self, X): + """Return the concatenated predictions from all base estimators.""" n_samples, _ = X.shape + pred = np.zeros((n_samples, self.n_outputs * self.n_estimators)) + for idx, (key, estimator) in enumerate(self.estimators_.items()): + if self.verbose > 1: + msg = "{} - Evaluating estimator = {:<5} in layer = {}" + key = key.split("-")[-1] + "_" + str(key.split("-")[-2]) + print(msg.format(_utils.ctime(), key, self.layer_idx)) + if self.partial_mode: + # Load the estimator from the buffer + estimator = self.buffer.load_estimator(estimator) + + left, right = self.n_outputs * idx, self.n_outputs * (idx + 1) + pred[:, left:right] += estimator.predict(X) + + return pred + + +class ClassificationCascadeLayer(BaseCascadeLayer, ClassifierMixin): + """Implementation of the cascade forest layer for classification.""" + + def __init__( + self, + layer_idx, + n_outputs, + criterion, + n_estimators=2, + n_trees=100, + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + backend="custom", + partial_mode=False, + buffer=None, + n_jobs=None, + random_state=None, + verbose=1, + ): + super().__init__( + layer_idx=layer_idx, + n_outputs=n_outputs, + criterion=criterion, + n_estimators=n_estimators, + n_trees=n_trees, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + backend=backend, + partial_mode=partial_mode, + buffer=buffer, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + ) + + def fit_transform(self, X, y, sample_weight=None): + + self._validate_params() + n_samples, self.n_features = X.shape X_aug = [] - oob_decision_function = np.zeros((n_samples, self.n_classes)) + oob_decision_function = np.zeros((n_samples, self.n_outputs)) # A random forest and an extremely random forest will be fitted for estimator_idx in range(self.n_estimators // 2): @@ -128,6 +219,7 @@ def fit_transform(self, X, y): self.partial_mode, self.buffer, self.verbose, + sample_weight, ) X_aug.append(X_aug_) key = "{}-{}-{}".format(self.layer_idx, estimator_idx, "rf") @@ -145,6 +237,7 @@ def fit_transform(self, X, y): self.partial_mode, self.buffer, self.verbose, + sample_weight, ) X_aug.append(X_aug_) key = "{}-{}-{}".format(self.layer_idx, estimator_idx, "erf") @@ -153,45 +246,207 @@ def fit_transform(self, X, y): # Set the OOB estimations and validation accuracy self.oob_decision_function_ = oob_decision_function / self.n_estimators y_pred = np.argmax(oob_decision_function, axis=1) - self.val_acc_ = accuracy_score(y, y_pred) + self.val_performance_ = accuracy_score( + y, y_pred, sample_weight=sample_weight + ) X_aug = np.hstack(X_aug) return X_aug - def transform(self, X): - """ - Return the concatenated transformation results from all base - estimators.""" + +class RegressionCascadeLayer(BaseCascadeLayer, RegressorMixin): + """Implementation of the cascade forest layer for regression.""" + + def __init__( + self, + layer_idx, + n_outputs, + criterion, + n_estimators=2, + n_trees=100, + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + backend="custom", + partial_mode=False, + buffer=None, + n_jobs=None, + random_state=None, + verbose=1, + ): + super().__init__( + layer_idx=layer_idx, + n_outputs=n_outputs, + criterion=criterion, + n_estimators=n_estimators, + n_trees=n_trees, + max_depth=max_depth, + 
min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + backend=backend, + partial_mode=partial_mode, + buffer=buffer, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + ) + + def fit_transform(self, X, y, sample_weight=None): + + self._validate_params() + n_samples, self.n_features = X.shape + + X_aug = [] + oob_decision_function = np.zeros((n_samples, self.n_outputs)) + + # A random forest and an extremely random forest will be fitted + for estimator_idx in range(self.n_estimators // 2): + X_aug_, _estimator = _build_estimator( + X, + y, + self.layer_idx, + estimator_idx, + "rf", + self._make_estimator(estimator_idx, "rf"), + oob_decision_function, + self.partial_mode, + self.buffer, + self.verbose, + sample_weight, + ) + X_aug.append(X_aug_) + key = "{}-{}-{}".format(self.layer_idx, estimator_idx, "rf") + self.estimators_.update({key: _estimator}) + + for estimator_idx in range(self.n_estimators // 2): + X_aug_, _estimator = _build_estimator( + X, + y, + self.layer_idx, + estimator_idx, + "erf", + self._make_estimator(estimator_idx, "erf"), + oob_decision_function, + self.partial_mode, + self.buffer, + self.verbose, + sample_weight, + ) + X_aug.append(X_aug_) + key = "{}-{}-{}".format(self.layer_idx, estimator_idx, "erf") + self.estimators_.update({key: _estimator}) + + # Set the OOB estimations and validation mean squared error + self.oob_decision_function_ = oob_decision_function / self.n_estimators + y_pred = self.oob_decision_function_ + self.val_performance_ = mean_squared_error( + y, y_pred, sample_weight=sample_weight + ) + + X_aug = np.hstack(X_aug) + return X_aug + + +class CustomCascadeLayer(object): + """Implementation of the cascade layer for customized base estimators.""" + + def __init__( + self, + layer_idx, + n_splits, + n_outputs, + estimators, + partial_mode=False, + buffer=None, + random_state=None, + verbose=1, + is_classifier=True, + ): + self.layer_idx = layer_idx + self.n_splits = n_splits + self.n_outputs = n_outputs + self.n_estimators = len(estimators) + self.dummy_estimators_ = estimators + self.partial_mode = partial_mode + self.buffer = buffer + self.random_state = random_state + self.verbose = verbose + self.is_classifier = is_classifier + # Internal container + self.estimators_ = {} + + def fit_transform(self, X, y, sample_weight=None): n_samples, _ = X.shape - X_aug = np.zeros((n_samples, self.n_classes * self.n_estimators)) - for idx, (key, estimator) in enumerate(self.estimators_.items()): + X_aug = [] + + # Parameters were already validated by upstream methods + for estimator_idx, estimator in enumerate(self.dummy_estimators_): + kfold_estimator = KFoldWrapper( + estimator, + self.n_splits, + self.n_outputs, + self.random_state, + self.verbose, + self.is_classifier, + ) + if self.verbose > 1: - msg = "{} - Evaluating estimator = {:<5} in layer = {}" - key = key.split('-')[-1] + "_" + str(key.split('-')[-2]) - print(msg.format(_utils.ctime(), key, self.layer_idx)) + msg = "{} - Fitting estimator = custom_{} in layer = {}" + print( + msg.format(_utils.ctime(), estimator_idx, self.layer_idx) + ) + + kfold_estimator.fit_transform(X, y, sample_weight) + X_aug.append(kfold_estimator.oob_decision_function_) + key = "{}-{}-custom".format(self.layer_idx, estimator_idx) + if self.partial_mode: - # Load the estimator from the buffer - estimator = self.buffer.load_estimator(estimator) + # Cache the fitted estimator in out-of-core mode + buffer_path = self.buffer.cache_estimator( + self.layer_idx, estimator_idx, "custom", 
kfold_estimator + ) + self.estimators_.update({key: buffer_path}) + else: + self.estimators_.update({key: kfold_estimator}) + + # Set the OOB estimations and validation performance + oob_decision_function = np.zeros_like(X_aug[0]) + for estimator_oob_decision_function in X_aug: + oob_decision_function += ( + estimator_oob_decision_function / self.n_estimators + ) - left, right = self.n_classes*idx, self.n_classes*(idx+1) - X_aug[:, left:right] += estimator.transform(X) + if self.is_classifier: # classification + y_pred = np.argmax(oob_decision_function, axis=1) + self.val_performance_ = accuracy_score( + y, y_pred, sample_weight=sample_weight + ) + else: # regression + y_pred = oob_decision_function + self.val_performance_ = mean_squared_error( + y, y_pred, sample_weight=sample_weight + ) + X_aug = np.hstack(X_aug) return X_aug + def transform(self, X): + """Preserved for the naming consistency.""" + return self.predict_full(X) + def predict_full(self, X): """Return the concatenated predictions from all base estimators.""" n_samples, _ = X.shape - pred = np.zeros((n_samples, self.n_classes * self.n_estimators)) + pred = np.zeros((n_samples, self.n_outputs * self.n_estimators)) for idx, (key, estimator) in enumerate(self.estimators_.items()): if self.verbose > 1: - msg = "{} - Evaluating estimator = {:<5} in layer = {}" - key = key.split('-')[-1] + "_" + str(key.split('-')[-2]) - print(msg.format(_utils.ctime(), key, self.layer_idx)) + msg = "{} - Evaluating estimator = custom_{} in layer = {}" + print(msg.format(_utils.ctime(), idx, self.layer_idx)) if self.partial_mode: # Load the estimator from the buffer estimator = self.buffer.load_estimator(estimator) - left, right = self.n_classes*idx, self.n_classes*(idx+1) + left, right = self.n_outputs * idx, self.n_outputs * (idx + 1) pred[:, left:right] += estimator.predict(X) return pred diff --git a/deepforest/_utils.py b/deepforest/_utils.py index 5fa7d21..c985edd 100644 --- a/deepforest/_utils.py +++ b/deepforest/_utils.py @@ -36,7 +36,7 @@ def init_array(X, n_aug_features): n_samples, n_features = X.shape n_dims = n_features + n_aug_features X_middle = np.zeros((n_samples, n_dims), dtype=np.uint8) - X_middle[:, : n_features] += X + X_middle[:, :n_features] += X return X_middle @@ -59,5 +59,5 @@ def merge_array(X_middle, X_aug, n_features): def ctime(): """A formatter on current time used for printing running status.""" - ctime = '[' + datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3] + ']' + ctime = "[" + datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S.%f")[:-3] + "]" return ctime diff --git a/deepforest/cascade.py b/deepforest/cascade.py index 04484d4..686f311 100644 --- a/deepforest/cascade.py +++ b/deepforest/cascade.py @@ -1,17 +1,30 @@ """Implementation of Deep Forest.""" -__all__ = ["CascadeForestClassifier"] +__all__ = ["CascadeForestClassifier", "CascadeForestRegressor"] -import time import numbers -import numpy as np +import time from abc import ABCMeta, abstractmethod -from . import _utils -from . import _io -from ._layer import Layer +import numpy as np +from sklearn.base import ( + BaseEstimator, + ClassifierMixin, + RegressorMixin, + is_classifier, +) +from sklearn.preprocessing import LabelEncoder +from sklearn.utils import check_array, check_X_y +from sklearn.utils.multiclass import type_of_target + +from . 
import _io, _utils from ._binner import Binner +from ._layer import ( + ClassificationCascadeLayer, + CustomCascadeLayer, + RegressionCascadeLayer, +) def _get_predictor_kwargs(predictor_kwargs, **kwargs) -> dict: @@ -22,11 +35,13 @@ def _get_predictor_kwargs(predictor_kwargs, **kwargs) -> dict: return predictor_kwargs -def _build_predictor( +def _build_classifier_predictor( predictor_name, + criterion, n_estimators, n_outputs, max_depth=None, + min_samples_split=2, min_samples_leaf=1, n_jobs=None, random_state=None, @@ -38,11 +53,14 @@ def _build_predictor( # Random Forest if predictor_name == "forest": from .forest import RandomForestClassifier + predictor = RandomForestClassifier( **_get_predictor_kwargs( predictor_kwargs, + criterion=criterion, n_estimators=n_estimators, max_depth=max_depth, + min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf, n_jobs=n_jobs, random_state=random_state, @@ -103,21 +121,117 @@ def _build_predictor( return predictor -__model_doc = """ +def _build_regressor_predictor( + predictor_name, + criterion, + n_estimators, + n_outputs, + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + n_jobs=None, + random_state=None, + predictor_kwargs={}, +): + """Build the predictor concatenated to the deep forest.""" + predictor_name = predictor_name.lower() + + # Random Forest + if predictor_name == "forest": + from .forest import RandomForestRegressor + + predictor = RandomForestRegressor( + **_get_predictor_kwargs( + predictor_kwargs, + criterion=criterion, + n_estimators=n_estimators, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + n_jobs=n_jobs, + random_state=random_state, + ) + ) + # XGBoost + elif predictor_name == "xgboost": + try: + xgb = __import__("xgboost.sklearn") + except ModuleNotFoundError: + msg = ( + "Cannot load the module XGBoost when building the predictor." + " Please make sure that XGBoost is installed." + ) + raise ModuleNotFoundError(msg) + + # The argument `tree_method` is always set as `hist` for XGBoost, + # because the exact mode of XGBoost is too slow. + objective = "reg:squarederror" + predictor = xgb.sklearn.XGBRegressor( + **_get_predictor_kwargs( + predictor_kwargs, + objective=objective, + n_estimators=n_estimators, + tree_method="hist", + n_jobs=n_jobs, + random_state=random_state, + ) + ) + # LightGBM + elif predictor_name == "lightgbm": + try: + lgb = __import__("lightgbm.sklearn") + except ModuleNotFoundError: + msg = ( + "Cannot load the module LightGBM when building the predictor." + " Please make sure that LightGBM is installed." + ) + raise ModuleNotFoundError(msg) + + objective = "regression" + predictor = lgb.LGBMRegressor( + **_get_predictor_kwargs( + predictor_kwargs, + objective=objective, + n_estimators=n_estimators, + n_jobs=n_jobs, + random_state=random_state, + ) + ) + else: + msg = ( + "The name of the predictor should be one of {{forest, xgboost," + " lightgbm}}, but got {} instead." + ) + raise NotImplementedError(msg.format(predictor_name)) + + return predictor + + +__classifier_model_doc = """ Parameters ---------- n_bins : :obj:`int`, default=255 The number of bins used for non-missing values. In addition to the ``n_bins`` bins, one more bin is reserved for missing values. Its value must be no smaller than 2 and no greater than 255. - bin_subsample : :obj:`int`, default=2e5 + bin_subsample : :obj:`int`, default=200,000 The number of samples used to construct feature discrete bins. 
If the size of training set is smaller than ``bin_subsample``, then all training samples will be used. + bin_type : :obj:`{"percentile", "interval"}`, default= :obj:`"percentile"` + The type of binner used to bin feature values into integer-valued bins. + + - If ``"percentile"``, each bin will have approximately the same + number of distinct feature values. + - If ``"interval"``, each bin will have approximately the same size. max_layers : :obj:`int`, default=20 The maximum number of cascade layers in the deep forest. Notice that the actual number of layers can be smaller than ``max_layers`` because of the internal early stopping stage. + criterion : :obj:`{"gini", "entropy"}`, default= :obj:`"gini"` + The function to measure the quality of a split. Supported criteria + are ``gini`` for the Gini impurity and ``entropy`` for the information + gain. Note: this parameter is tree-specific. n_estimators : :obj:`int`, default=2 The number of estimator in each cascade layer. It will be multiplied by 2 internally because each estimator contains a @@ -127,12 +241,136 @@ def _build_predictor( The number of trees in each estimator. max_depth : :obj:`int`, default=None The maximum depth of each tree. ``None`` indicates no constraint. + min_samples_split : :obj:`int`, default=2 + The minimum number of samples required to split an internal node. + min_samples_leaf : :obj:`int`, default=1 + The minimum number of samples required to be at a leaf node. + use_predictor : :obj:`bool`, default=False + Whether to build the predictor concatenated to the deep forest. Using + the predictor may improve the performance of deep forest. + predictor : :obj:`{"forest", "xgboost", "lightgbm"}`, default= :obj:`"forest"` + The type of the predictor concatenated to the deep forest. If + ``use_predictor`` is False, this parameter will have no effect. + predictor_kwargs : :obj:`dict`, default={} + The configuration of the predictor concatenated to the deep forest. + Specifying this will extend/overwrite the original parameters inherit + from deep forest. If ``use_predictor`` is False, this parameter will + have no effect. + backend : :obj:`{"custom", "sklearn"}`, default= :obj:`"custom"` + The backend of the forest estimator. Supported backends are ``custom`` + for higher time and memory efficiency and ``sklearn`` for additional + functionality. + n_tolerant_rounds : :obj:`int`, default=2 + Specify when to conduct early stopping. The training process + terminates when the validation performance on the training set does + not improve compared against the best validation performance achieved + so far for ``n_tolerant_rounds`` rounds. + delta : :obj:`float`, default=1e-5 + Specify the threshold on early stopping. The counting on + ``n_tolerant_rounds`` is triggered if the performance of a fitted + cascade layer does not improve by ``delta`` compared against the best + validation performance achieved so far. + partial_mode : :obj:`bool`, default=False + Whether to train the deep forest in partial mode. For large + datasets, it is recommended to use the partial mode. + + - If ``True``, the partial mode is activated and all fitted + estimators will be dumped in a local buffer; + - If ``False``, all fitted estimators are directly stored in the + memory. + n_jobs : :obj:`int` or ``None``, default=None + The number of jobs to run in parallel for both :meth:`fit` and + :meth:`predict`. None means 1 unless in a + :obj:`joblib.parallel_backend` context. ``-1`` means using all + processors. 
+ random_state : :obj:`int` or ``None``, default=None + + - If :obj:`int`, ``random_state`` is the seed used by the random + number generator; + - If ``None``, the random number generator is the RandomState + instance used by :mod:`np.random`. + verbose : :obj:`int`, default=1 + Controls the verbosity when fitting and predicting. + + - If ``<= 0``, silent mode, which means no logging information will + be displayed; + - If ``1``, logging information on the cascade layer level will be + displayed; + - If ``> 1``, full logging information will be displayed. +""" + + +__classifier_fit_doc = """ + + .. note:: + + Deep forest supports two kinds of modes for training: + + - **Full memory mode**, in which the training / testing data and + all fitted estimators are directly stored in the memory. + - **Partial mode**, in which after fitting each estimator using + the training data, it will be dumped in the buffer. During the + evaluating stage, the dumped estimators are reloaded into the + memory sequentially to evaluate the testing data. + + By setting the ``partial_mode`` to ``True``, the partial mode is + activated, and a local buffer will be created at the current + directory. The partial mode is able to reduce the running memory + cost when training the deep forest. + + Parameters + ---------- + X : :obj: array-like of shape (n_samples, n_features) + The training data. Internally, it will be converted to + ``np.uint8``. + y : :obj:`numpy.ndarray` of shape (n_samples,) + The class labels of input samples. + sample_weight : :obj:`numpy.ndarray` of shape (n_samples,), default=None + Sample weights. If ``None``, then samples are equally weighted. +""" + +__regressor_model_doc = """ + Parameters + ---------- + n_bins : :obj:`int`, default=255 + The number of bins used for non-missing values. In addition to the + ``n_bins`` bins, one more bin is reserved for missing values. Its + value must be no smaller than 2 and no greater than 255. + bin_subsample : :obj:`int`, default=200,000 + The number of samples used to construct feature discrete bins. If + the size of training set is smaller than ``bin_subsample``, then all + training samples will be used. + bin_type : :obj:`{"percentile", "interval"}`, default= :obj:`"percentile"` + The type of binner used to bin feature values into integer-valued bins. + + - If ``"percentile"``, each bin will have approximately the same + number of distinct feature values. + - If ``"interval"``, each bin will have approximately the same size. + max_layers : :obj:`int`, default=20 + The maximum number of cascade layers in the deep forest. Notice that + the actual number of layers can be smaller than ``max_layers`` because + of the internal early stopping stage. + criterion : :obj:`{"mse", "mae"}`, default= :obj:`"mse"` + The function to measure the quality of a split. Supported criteria are + ``mse`` for the mean squared error, which is equal to variance reduction + as feature selection criterion, and ``mae`` for the mean absolute error. + n_estimators : :obj:`int`, default=2 + The number of estimator in each cascade layer. It will be multiplied + by 2 internally because each estimator contains a + :class:`RandomForestRegressor` and a :class:`ExtraTreesRegressor`, + respectively. + n_trees : :obj:`int`, default=100 + The number of trees in each estimator. + max_depth : :obj:`int`, default=None + The maximum depth of each tree. ``None`` indicates no constraint. + min_samples_split : :obj:`int`, default=2 + The minimum number of samples required to split an internal node. 
min_samples_leaf : :obj:`int`, default=1 The minimum number of samples required to be at a leaf node. use_predictor : :obj:`bool`, default=False Whether to build the predictor concatenated to the deep forest. Using the predictor may improve the performance of deep forest. - predictor : :obj:`{"forest", "xgboost", "lightgbm"}`, default="forest" + predictor : :obj:`{"forest", "xgboost", "lightgbm"}`, default= :obj:`"forest"` The type of the predictor concatenated to the deep forest. If ``use_predictor`` is False, this parameter will have no effect. predictor_kwargs : :obj:`dict`, default={} @@ -140,6 +378,10 @@ def _build_predictor( Specifying this will extend/overwrite the original parameters inherit from deep forest. If ``use_predictor`` is False, this parameter will have no effect. + backend : :obj:`{"custom", "sklearn"}`, default= :obj:`"custom"` + The backend of the forest estimator. Supported backends are ``custom`` + for higher time and memory efficiency and ``sklearn`` for additional + functionality. n_tolerant_rounds : :obj:`int`, default=2 Specify when to conduct early stopping. The training process terminates when the validation performance on the training set does @@ -164,7 +406,8 @@ def _build_predictor( :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. random_state : :obj:`int` or ``None``, default=None - - If :obj:``int``, ``random_state`` is the seed used by the random + + - If :obj:`int`, ``random_state`` is the seed used by the random number generator; - If ``None``, the random number generator is the RandomState instance used by :mod:`np.random`. @@ -178,50 +421,104 @@ def _build_predictor( - If ``> 1``, full logging information will be displayed. """ +__regressor_fit_doc = """ + + .. note:: + + Deep forest supports two kinds of modes for training: + + - **Full memory mode**, in which the training / testing data and + all fitted estimators are directly stored in the memory. + - **Partial mode**, in which after fitting each estimator using + the training data, it will be dumped in the buffer. During the + evaluating stage, the dumped estimators are reloaded into the + memory sequentially to evaluate the testing data. + + By setting the ``partial_mode`` to ``True``, the partial mode is + activated, and a local buffer will be created at the current + directory. The partial mode is able to reduce the running memory + cost when training the deep forest. + + Parameters + ---------- + X : :obj: array-like of shape (n_samples, n_features) + The training data. Internally, it will be converted to + ``np.uint8``. + y : :obj:`numpy.ndarray` of shape (n_samples,) or (n_samples, n_outputs) + The target values of input samples. + sample_weight : :obj:`numpy.ndarray` of shape (n_samples,), default=None + Sample weights. If ``None``, then samples are equally weighted. +""" + + +def deepforest_model_doc(header, item): + """ + Decorator on obtaining documentation for deep forest models. + + Parameters + ---------- + header: string + Introduction to the decorated class or method. + item : string + Type of the docstring item. 
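# The regressor shares the parameter set documented above; a brief sketch,
# assuming CascadeForestRegressor is importable from the package root and
# that the optional "xgboost"/"lightgbm" predictors need those packages
# installed, as the docstring states.
from sklearn.datasets import make_regression
from deepforest import CascadeForestRegressor

X, y = make_regression(n_samples=800, n_features=15, random_state=0)

reg = CascadeForestRegressor(
    criterion="mse",        # "mae" selects the mean absolute error instead
    use_predictor=True,     # append a final predictor on top of the cascade
    predictor="forest",     # or "xgboost" / "lightgbm"
    random_state=0,
)
reg.fit(X, y)
print(reg.predict(X[:3]))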
+ """ + + def get_doc(item): + """Return the selected item.""" + __doc = { + "regressor_model": __regressor_model_doc, + "regressor_fit": __regressor_fit_doc, + "classifier_model": __classifier_model_doc, + "classifier_fit": __classifier_fit_doc, + } + + return __doc[item] -def deepforest_model_doc(header): - """Decorator on obtaining documentation for deep forest models.""" def adddoc(cls): doc = [header + "\n\n"] - doc.extend([__model_doc]) + doc.extend(get_doc(item)) cls.__doc__ = "".join(doc) - return cls return adddoc -class BaseCascadeForest(metaclass=ABCMeta): - +class BaseCascadeForest(BaseEstimator, metaclass=ABCMeta): def __init__( self, n_bins=255, - bin_subsample=2e5, + bin_subsample=200000, bin_type="percentile", max_layers=20, + criterion="", n_estimators=2, n_trees=100, max_depth=None, + min_samples_split=2, min_samples_leaf=1, use_predictor=False, predictor="forest", predictor_kwargs={}, + backend="custom", n_tolerant_rounds=2, delta=1e-5, partial_mode=False, n_jobs=None, random_state=None, - verbose=1 + verbose=1, ): self.n_bins = n_bins self.bin_subsample = bin_subsample self.bin_type = bin_type self.max_layers = max_layers + self.criterion = criterion self.n_estimators = n_estimators self.n_trees = n_trees self.max_depth = max_depth + self.min_samples_split = min_samples_split self.min_samples_leaf = min_samples_leaf self.predictor_kwargs = predictor_kwargs + self.backend = backend self.n_tolerant_rounds = n_tolerant_rounds self.delta = delta self.partial_mode = partial_mode @@ -240,7 +537,7 @@ def __init__( # Predictor self.use_predictor = use_predictor - self.predictor_name = predictor + self.predictor = predictor def __len__(self): return self.n_layers_ @@ -250,8 +547,34 @@ def __getitem__(self, index): def _get_n_output(self, y): """Return the number of output inferred from the training labels.""" - n_output = np.unique(y).shape[0] # classification - return n_output + if is_classifier(self): + n_output = np.unique(y).shape[0] # classification + return n_output + return y.shape[1] if len(y.shape) > 1 else 1 # regression + + def _make_layer(self, **layer_args): + """Make and configure a cascade layer.""" + if not hasattr(self, "use_custom_estimator"): + # Use built-in cascade layers + if is_classifier(self): + layer = ClassificationCascadeLayer(**layer_args) + else: + layer = RegressionCascadeLayer(**layer_args) + else: + # Use customized cascade layers + layer = CustomCascadeLayer( + layer_args["layer_idx"], + self.n_splits, + layer_args["n_outputs"], + self.dummy_estimators, + layer_args["partial_mode"], + layer_args["buffer"], + layer_args["random_state"], + layer_args["verbose"], + is_classifier(self), + ) + + return layer def _get_layer(self, layer_idx): """Get the layer from the internal container according to the index.""" @@ -260,7 +583,7 @@ def _get_layer(self, layer_idx): "The layer index should be in the range [0, {}], but got {}" " instead." ) - raise ValueError(msg.format(self.n_layers_ - 1, layer_idx)) + raise IndexError(msg.format(self.n_layers_ - 1, layer_idx)) layer_key = "layer_{}".format(layer_idx) @@ -271,8 +594,10 @@ def _set_layer(self, layer_idx, layer): Register a layer into the internal container with the given index.""" layer_key = "layer_{}".format(layer_idx) if layer_key in self.layers_: - msg = ("Layer with the key {} already exists in the internal" - " container.") + msg = ( + "Layer with the key {} already exists in the internal" + " container." 
+ ) raise RuntimeError(msg.format(layer_key)) self.layers_.update({layer_key: layer}) @@ -295,8 +620,10 @@ def _set_binner(self, binner_idx, binner): Register a binner into the internal container with the given index.""" binner_key = "binner_{}".format(binner_idx) if binner_key in self.binners_: - msg = ("Binner with the key {} already exists in the internal" - " container.") + msg = ( + "Binner with the key {} already exists in the internal" + " container." + ) raise RuntimeError(msg.format(binner_key)) self.binners_.update({binner_key: binner}) @@ -318,8 +645,10 @@ def _set_n_trees(self, layer_idx): n_trees = 100 * (layer_idx + 1) return n_trees if n_trees <= 500 else 500 else: - msg = ("Invalid value for n_trees. Allowed values are integers or" - " 'auto'.") + msg = ( + "Invalid value for n_trees. Allowed values are integers or" + " 'auto'." + ) raise ValueError(msg) def _check_input(self, X, y=None): @@ -343,6 +672,10 @@ def _validate_params(self): msg = "max_layers = {} should be strictly positive." raise ValueError(msg.format(self.max_layers)) + if not self.backend in ("custom", "sklearn"): + msg = "backend = {} should be one of {{custom, sklearn}}." + raise ValueError(msg.format(self.backend)) + if not self.n_tolerant_rounds > 0: msg = "n_tolerant_rounds = {} should be strictly positive." raise ValueError(msg.format(self.n_tolerant_rounds)) @@ -366,8 +699,10 @@ def _bin_data(self, binner, X, is_training_data=True): binning_time = toc - tic if self.verbose > 1: - msg = ("{} Binning {} data: {:.3f} MB => {:.3f} MB |" - " Elapsed = {:.3f} s") + msg = ( + "{} Binning {} data: {:.3f} MB => {:.3f} MB |" + " Elapsed = {:.3f} s" + ) print( msg.format( _utils.ctime(), @@ -404,6 +739,13 @@ def _handle_early_stopping(self): msg = "{} The optimal number of layers: {}" print(msg.format(_utils.ctime(), self.n_layers_)) + def _if_improved(self, new_pivot, pivot, delta): + """ + Return true if new validation result is better than previous""" + if is_classifier(self): + return new_pivot >= pivot + delta + return new_pivot <= pivot - delta + @abstractmethod def _repr_performance(self, pivot): """Format the printting information on training performance.""" @@ -412,43 +754,28 @@ def _repr_performance(self, pivot): def predict(self, X): """ Predict class labels or regression values for X. - For classification, the predicted class for each sample in X is returned. For regression, the predicted value based on X is returned. """ @property def n_aug_features_(self): - return 2 * self.n_estimators * self.n_outputs_ - - def fit(self, X, y): - """ - Build a deep forest using the training data. - - .. note:: - - Deep forest supports two kinds of modes for training: - - - **Full memory mode**, in which the training / testing data and - all fitted estimators are directly stored in the memory. - - **Partial mode**, in which after fitting each estimator using - the training data, it will be dumped in the buffer. During the - evaluating stage, the dumped estimators are reloaded into the - memory sequentially to evaluate the testing data. - - By setting the ``partial_mode`` to ``True``, the partial mode is - activated, and a local buffer will be created at the current - directory. The partial mode is able to reduce the running memory - cost when training the deep forest. 
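# The early-stopping rule above compares each new layer's validation result
# with the best one so far: classifiers must improve by at least ``delta``,
# regressors must reduce the error by at least ``delta``. A toy trace of the
# counter logic used in fit() (pure Python, independent of the library):
def if_improved(new_pivot, pivot, delta, classification=True):
    if classification:
        return new_pivot >= pivot + delta
    return new_pivot <= pivot - delta

delta, n_tolerant_rounds = 1e-5, 2
pivot, n_counter = 0.90, 0
for layer_idx, val_acc in enumerate([0.92, 0.931, 0.931, 0.9305], start=1):
    if if_improved(val_acc, pivot, delta):
        pivot, n_counter = val_acc, 0      # improvement: keep the layer, reset the counter
    else:
        n_counter += 1                     # no improvement: count towards early stopping
        if n_counter >= n_tolerant_rounds:
            print("stop after layer", layer_idx, "best =", pivot)
            break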
+ if not hasattr(self, "use_custom_estimator"): + return 2 * self.n_estimators * self.n_outputs_ + else: + return self.n_estimators * self.n_outputs_ + + # flake8: noqa: E501 + def fit(self, X, y, sample_weight=None): + X, y = check_X_y( + X, + y, + multi_output=True + if type_of_target(y) + in ("continuous-multioutput", "multiclass-multioutput") + else False, + ) - Parameters - ---------- - X : :obj:`numpy.ndarray` of shape (n_samples, n_features) - The training data. Internally, it will be converted to - ``np.uint8``. - y : :obj:`numpy.ndarray` of shape (n_samples,) - The class labels of input samples. - """ self._check_input(X, y) self._validate_params() n_counter = 0 # a counter controlling the early stopping @@ -457,7 +784,7 @@ def fit(self, X, y): n_bins=self.n_bins, bin_subsample=self.bin_subsample, bin_type=self.bin_type, - random_state=self.random_state + random_state=self.random_state, ) # Bin the training data @@ -472,30 +799,35 @@ def fit(self, X, y): print("{} Start to fit the model:".format(_utils.ctime())) # Build the first cascade layer - layer_ = Layer( - 0, - self.n_outputs_, - self.n_estimators, - self._set_n_trees(0), - self.max_depth, - self.min_samples_leaf, - self.partial_mode, - self.buffer_, - self.n_jobs, - self.random_state, - self.verbose + layer_ = self._make_layer( + layer_idx=0, + n_outputs=self.n_outputs_, + criterion=self.criterion, + n_estimators=self.n_estimators, + n_trees=self._set_n_trees(0), + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + backend=self.backend, + partial_mode=self.partial_mode, + buffer=self.buffer_, + n_jobs=self.n_jobs, + random_state=self.random_state, + verbose=self.verbose, ) if self.verbose > 0: print("{} Fitting cascade layer = {:<2}".format(_utils.ctime(), 0)) tic = time.time() - X_aug_train_ = layer_.fit_transform(X_train_, y) + X_aug_train_ = layer_.fit_transform( + X_train_, y, sample_weight=sample_weight + ) toc = time.time() training_time = toc - tic # Set the reference performance - pivot = layer_.val_acc_ + pivot = layer_.val_performance_ if self.verbose > 0: msg = "{} layer = {:<2} | {} | Elapsed = {:.3f} s" @@ -504,7 +836,7 @@ def fit(self, X, y): _utils.ctime(), 0, self._repr_performance(pivot), - training_time + training_time, ) ) @@ -531,7 +863,7 @@ def fit(self, X, y): n_bins=self.n_bins, bin_subsample=self.bin_subsample, bin_type=self.bin_type, - random_state=self.random_state + random_state=self.random_state, ) X_binned_aug_train_ = self._bin_data( @@ -539,22 +871,26 @@ def fit(self, X, y): ) X_middle_train_ = _utils.merge_array( - X_middle_train_, X_binned_aug_train_, self.n_features_) + X_middle_train_, X_binned_aug_train_, self.n_features_ + ) # Build a cascade layer layer_idx = self.n_layers_ - layer_ = Layer( - layer_idx, - self.n_outputs_, - self.n_estimators, - self._set_n_trees(layer_idx), - self.max_depth, - self.min_samples_leaf, - self.partial_mode, - self.buffer_, - self.n_jobs, - self.random_state, - self.verbose + layer_ = self._make_layer( + layer_idx=layer_idx, + n_outputs=self.n_outputs_, + criterion=self.criterion, + n_estimators=self.n_estimators, + n_trees=self._set_n_trees(layer_idx), + max_depth=self.max_depth, + min_samples_split=self.min_samples_split, + min_samples_leaf=self.min_samples_leaf, + backend=self.backend, + partial_mode=self.partial_mode, + buffer=self.buffer_, + n_jobs=self.n_jobs, + random_state=self.random_state, + verbose=self.verbose, ) X_middle_train_ = self.buffer_.cache_data( @@ -566,11 +902,13 @@ def 
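# fit() above only allows 2-D targets when type_of_target reports a
# multi-output problem; a standalone sketch of that validation idiom:
import numpy as np
from sklearn.utils import check_X_y
from sklearn.utils.multiclass import type_of_target

rng = np.random.RandomState(0)
X = rng.rand(10, 3)
y = rng.rand(10, 2)                          # two continuous targets

multi = type_of_target(y) in (
    "continuous-multioutput",
    "multiclass-multioutput",
)
X_checked, y_checked = check_X_y(X, y, multi_output=multi)
print(type_of_target(y), y_checked.shape)    # continuous-multioutput (10, 2)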
fit(self, X, y): print(msg.format(_utils.ctime(), layer_idx)) tic = time.time() - X_aug_train_ = layer_.fit_transform(X_middle_train_, y) + X_aug_train_ = layer_.fit_transform( + X_middle_train_, y, sample_weight=sample_weight + ) toc = time.time() training_time = toc - tic - new_pivot = layer_.val_acc_ + new_pivot = layer_.val_performance_ if self.verbose > 0: msg = "{} layer = {:<2} | {} | Elapsed = {:.3f} s" @@ -579,7 +917,7 @@ def fit(self, X, y): _utils.ctime(), layer_idx, self._repr_performance(new_pivot), - training_time + training_time, ) ) @@ -589,7 +927,7 @@ def fit(self, X, y): # training stage will terminate before reaching the maximum number # of layers. - if new_pivot >= pivot + self.delta: + if self._if_improved(new_pivot, pivot, self.delta): # Update the cascade layer self._set_layer(layer_idx, layer_) @@ -609,9 +947,7 @@ def fit(self, X, y): msg = "{} Early stopping counter: {} out of {}" print( msg.format( - _utils.ctime(), - n_counter, - self.n_tolerant_rounds + _utils.ctime(), n_counter, self.n_tolerant_rounds ) ) @@ -636,22 +972,44 @@ def fit(self, X, y): # Build the predictor if `self.use_predictor` is True if self.use_predictor: - self.predictor_ = _build_predictor( - self.predictor_name, - self.n_trees, - self.n_outputs_, - self.max_depth, - self.min_samples_leaf, - self.n_jobs, - self.random_state, - self.predictor_kwargs, - ) + # Use built-in predictors + if self.predictor in ("forest", "xgboost", "lightgbm"): + if is_classifier(self): + self.predictor_ = _build_classifier_predictor( + self.predictor, + self.criterion, + self.n_trees, + self.n_outputs_, + self.max_depth, + self.min_samples_split, + self.min_samples_leaf, + self.n_jobs, + self.random_state, + self.predictor_kwargs, + ) + else: + self.predictor_ = _build_regressor_predictor( + self.predictor, + self.criterion, + self.n_trees, + self.n_outputs_, + self.max_depth, + self.min_samples_split, + self.min_samples_leaf, + self.n_jobs, + self.random_state, + self.predictor_kwargs, + ) + elif self.predictor == "custom": + if not hasattr(self, "predictor_"): + msg = "Missing predictor after calling `set_predictor`" + raise RuntimeError(msg) binner_ = Binner( n_bins=self.n_bins, bin_subsample=self.bin_subsample, bin_type=self.bin_type, - random_state=self.random_state + random_state=self.random_state, ) X_binned_aug_train_ = self._bin_data( @@ -659,14 +1017,15 @@ def fit(self, X, y): ) X_middle_train_ = _utils.merge_array( - X_middle_train_, X_binned_aug_train_, self.n_features_) + X_middle_train_, X_binned_aug_train_, self.n_features_ + ) if self.verbose > 0: msg = "{} Fitting the concatenated predictor: {}" - print(msg.format(_utils.ctime(), self.predictor_name)) + print(msg.format(_utils.ctime(), self.predictor)) tic = time.time() - self.predictor_.fit(X_middle_train_, y) + self.predictor_.fit(X_middle_train_, y, sample_weight) toc = time.time() if self.verbose > 0: @@ -680,9 +1039,193 @@ def fit(self, X, y): return self + def set_estimator(self, estimators, n_splits=5): + """ + Specify the custom base estimators for cascade layers. + + Parameters + ---------- + estimators : :obj:`list` + A list of your base estimators, will be used in all cascade layers. + n_splits : :obj:`int`, default=5 + The number of folds, must be at least 2. + """ + # Validation check + if not isinstance(estimators, list): + msg = ( + "estimators should be a list that stores instantiated" + " objects of your base estimator." 
+ ) + raise ValueError(msg) + + for idx, estimator in enumerate(estimators): + if not callable(getattr(estimator, "fit", None)): + msg = "The `fit` method of estimator = {} is not callable." + raise AttributeError(msg.format(idx)) + + if is_classifier(self) and not callable( + getattr(estimator, "predict_proba", None) + ): + msg = ( + "The `predict_proba` method of estimator = {} is not" + " callable." + ) + raise AttributeError(msg.format(idx)) + + if not is_classifier(self) and not callable( + getattr(estimator, "predict", None) + ): + msg = "The `predict` method of estimator = {} is not callable." + raise AttributeError(msg.format(idx)) + + if not n_splits >= 2: + msg = "n_splits = {} should be at least 2." + raise ValueError(msg.format(n_splits)) + + self.dummy_estimators = estimators + self.n_splits = n_splits + self.use_custom_estimator = True + + # Update attributes + self.n_estimators = len(estimators) + + def set_predictor(self, predictor): + """ + Specify the custom predictor concatenated to deep forest. + + Parameters + ---------- + predictor : :obj:`object` + The instantiated object of your predictor. + """ + # Validation check + if not callable(getattr(predictor, "fit", None)): + msg = "The `fit` method of the predictor is not callable." + raise AttributeError(msg) + + if is_classifier(self) and not callable( + getattr(predictor, "predict_proba", None) + ): + msg = ( + "The `predict_proba` method of the predictor is not" + " callable." + ) + raise AttributeError(msg) + + if not is_classifier(self) and not callable( + getattr(predictor, "predict", None) + ): + msg = "The `predict` method of the predictor is not callable." + raise AttributeError(msg) + + # Set related attributes + self.predictor = "custom" + self.predictor_ = predictor + self.use_predictor = True + + def get_layer_feature_importances(self, layer_idx): + """ + Return the feature importances of ``layer_idx``-th cascade layer. + + Parameters + ---------- + layer_idx : :obj:`int` + The index of the cascade layer, should be in the range + ``[0, self.n_layers_-1]``. + + Returns + ------- + feature_importances_: :obj:`numpy.ndarray` of shape (n_features,) + The impurity-based feature importances of the cascade layer. + Notice that the number of input features are different between the + first cascade layer and remaining cascade layers. + + + .. note:: + - This method is only applicable when deep forest is built using + the ``sklearn`` backend + - The functionality of this method is not available when using + customized estimators in deep forest. + """ + if self.backend == "custom": + msg = ( + "Please use the sklearn backend to get the feature" + " importances property for each cascade layer." + ) + raise RuntimeError(msg) + layer = self._get_layer(layer_idx) + return layer.feature_importances_ + + def get_estimator(self, layer_idx, est_idx, estimator_type): + """ + Get estimator from a cascade layer in the deep forest. + + Parameters + ---------- + layer_idx : :obj:`int` + The index of the cascade layer, should be in the range + ``[0, self.n_layers_-1]``. + est_idx : :obj:`int` + The index of the estimator, should be in the range + ``[0, self.n_estimators]``. + estimator_type : :obj:`{"rf", "erf", "custom"}` + Specify the forest type. + + - If ``rf``, return the random forest. + - If ``erf``, return the extremely random forest. + - If ``custom``, return the customized estimator, only applicable + when using customized estimators in deep forest via + :meth:`set_estimator`. 
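# set_estimator and set_predictor above let the cascade run on user-supplied
# models instead of the built-in forests. A configuration sketch with
# scikit-learn estimators; any instantiated object exposing fit and
# predict_proba (for classification) passes the checks. Import path for
# CascadeForestClassifier assumed as in the earlier sketches.
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from deepforest import CascadeForestClassifier

model = CascadeForestClassifier(random_state=0)

# Each cascade layer reuses these estimators; n_splits controls the internal
# k-fold used by the custom layer, as documented above.
model.set_estimator(
    [RandomForestClassifier(n_estimators=50), LogisticRegression(max_iter=1000)],
    n_splits=5,
)

# Optional custom predictor concatenated after the last cascade layer.
model.set_predictor(GradientBoostingClassifier())
# model.fit(X, y) then proceeds exactly as with the built-in forests.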
+ + Returns + ------- + estimator : Estimator with the given index. + """ + if not self.is_fitted_: + raise AttributeError("Please fit the model first.") + + # Check the given index + if not 0 <= layer_idx < self.n_layers_: + msg = ( + "`layer_idx` should be in the range [0, {}), but got" + " {} instead." + ) + raise ValueError(msg.format(self.n_layers_, layer_idx)) + + if not 0 <= est_idx < self.n_estimators: + msg = ( + "`est_idx` should be in the range [0, {}), but got" + " {} instead." + ) + raise ValueError(msg.format(self.n_estimators, est_idx)) + + if estimator_type not in ("rf", "erf", "custom"): + msg = ( + "`estimator_type` should be one of {{rf, erf, custom}}," + " but got {} instead." + ) + raise ValueError(msg.format(estimator_type)) + + if estimator_type == "custom" and not self.use_custom_estimator: + msg = ( + "`estimator_type` = {} is only applicable when using" + "customized estimators in deep forest." + ) + raise ValueError(msg.format(estimator_type)) + + layer = self._get_layer(layer_idx) + est_key = "{}-{}-{}".format(layer_idx, est_idx, estimator_type) + estimator = layer.estimators_[est_key] + + # Load the model if in partial mode + if self.partial_mode: + estimator = self.buffer_.load_estimator(estimator) + + return estimator.estimator_ + def save(self, dirname="model"): """ - Save the model to the specified directory. + Save the model to the directory ``dirname``. Parameters ---------- @@ -691,10 +1234,9 @@ def save(self, dirname="model"): .. warning:: - Other methods on model serialization such as :mod:`pickle` or :mod:`joblib` are not recommended, especially when ``partial_mode`` - is set to ``True``. + is set to True. """ # Create the output directory _io.model_mkdir(dirname) @@ -702,6 +1244,7 @@ def save(self, dirname="model"): # Save each object sequentially d = {} d["n_estimators"] = self.n_estimators + d["criterion"] = self.criterion d["n_layers"] = self.n_layers_ d["n_features"] = self.n_features_ d["n_outputs"] = self.n_outputs_ @@ -709,8 +1252,19 @@ def save(self, dirname="model"): d["buffer"] = self.buffer_ d["verbose"] = self.verbose d["use_predictor"] = self.use_predictor + d["is_classifier"] = is_classifier(self) + d["use_custom_estimator"] = ( + True if hasattr(self, "use_custom_estimator") else False + ) + if self.use_predictor: - d["predictor_name"] = self.predictor_name + d["predictor"] = self.predictor + + # Save label encoder if labels are encoded. + if hasattr(self, "labels_are_encoded"): + d["labels_are_encoded"] = self.labels_are_encoded + d["label_encoder"] = self.label_encoder_ + _io.model_saveobj(dirname, "param", d) _io.model_saveobj(dirname, "binner", self.binners_) _io.model_saveobj(dirname, "layer", self.layers_, self.partial_mode) @@ -722,7 +1276,7 @@ def save(self, dirname="model"): def load(self, dirname): """ - Load the model from the specified directory. + Load the model from the directory ``dirname``. Parameters ---------- @@ -731,7 +1285,6 @@ def load(self, dirname): .. note:: - The dumped model after calling :meth:`load_model` is not exactly the same as the model before saving, because many objects irrelevant to model inference will not be saved. @@ -744,8 +1297,16 @@ def load(self, dirname): self.n_features_ = d["n_features"] self.n_outputs_ = d["n_outputs"] self.partial_mode = d["partial_mode"] + self.buffer_ = d["buffer"] self.verbose = d["verbose"] self.use_predictor = d["use_predictor"] + if d["use_custom_estimator"]: + self.use_custom_estimator = True + + # Load label encoder if labels are encoded. 
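# After fitting, individual forests can be pulled out of a layer and the model
# can be persisted with save(); both APIs are defined above. This sketch
# assumes a fitted classifier ``model`` as in the earlier examples.
est = model.get_estimator(layer_idx=0, est_idx=0, estimator_type="rf")
print(type(est))                      # the underlying random forest of layer 0

# Per-layer impurity-based importances require backend="sklearn":
# importances = model.get_layer_feature_importances(0)

model.save("model")                   # writes parameters, binners and layers to ./model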
+ if "labels_are_encoded" in d: + self.labels_are_encoded = d["labels_are_encoded"] + self.label_encoder_ = d["label_encoder"] # Load internal containers self.binners_ = _io.model_loadobj(dirname, "binner") @@ -755,39 +1316,134 @@ def load(self, dirname): # Some checks after loading if len(self.layers_) != self.n_layers_: - msg = ("The size of the loaded dictionary of layers {} does not" - " match n_layers_ {}.") + msg = ( + "The size of the loaded dictionary of layers {} does not" + " match n_layers_ {}." + ) raise RuntimeError(msg.format(len(self.layers_), self.n_layers_)) self.is_fitted_ = True def clean(self): - """ - Clean the buffer created by the model if ``partial_mode`` is ``True``. - """ + """Clean the buffer created by the model.""" if self.partial_mode: self.buffer_.close() @deepforest_model_doc( - """Implementation of the deep forest for classification.""" + """Implementation of the deep forest for classification.""", + "classifier_model", ) -class CascadeForestClassifier(BaseCascadeForest): +class CascadeForestClassifier(BaseCascadeForest, ClassifierMixin): + def __init__( + self, + n_bins=255, + bin_subsample=200000, + bin_type="percentile", + max_layers=20, + criterion="gini", + n_estimators=2, + n_trees=100, + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + use_predictor=False, + predictor="forest", + predictor_kwargs={}, + backend="custom", + n_tolerant_rounds=2, + delta=1e-5, + partial_mode=False, + n_jobs=None, + random_state=None, + verbose=1, + ): + super().__init__( + n_bins=n_bins, + bin_subsample=bin_subsample, + bin_type=bin_type, + max_layers=max_layers, + criterion=criterion, + n_estimators=n_estimators, + n_trees=n_trees, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + use_predictor=use_predictor, + predictor=predictor, + predictor_kwargs=predictor_kwargs, + backend=backend, + n_tolerant_rounds=n_tolerant_rounds, + delta=delta, + partial_mode=partial_mode, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + ) + + # Used to deal with classification labels + self.labels_are_encoded = False + self.type_of_target_ = None + self.label_encoder_ = None + + def _encode_class_labels(self, y): + """ + Fit the internal label encoder and return encoded labels. + """ + self.type_of_target_ = type_of_target(y) + if self.type_of_target_ in ("binary", "multiclass"): + self.labels_are_encoded = True + self.label_encoder_ = LabelEncoder() + encoded_y = self.label_encoder_.fit_transform(y) + else: + msg = ( + "CascadeForestClassifier is used for binary and multiclass" + " classification, wheras the training labels seem not to" + " be any one of them." + ) + raise ValueError(msg) + + return encoded_y + + def _decode_class_labels(self, y): + """ + Transform the predicted labels back to original encoding. 
+ """ + if self.labels_are_encoded: + decoded_y = self.label_encoder_.inverse_transform(y) + else: + decoded_y = y - def __init__(self, **kwargs): - super().__init__(**kwargs) + return decoded_y def _repr_performance(self, pivot): msg = "Val Acc = {:.3f} %" return msg.format(pivot * 100) + @deepforest_model_doc( + """Build a deep forest using the training data.""", "classifier_fit" + ) + def fit(self, X, y, sample_weight=None): + X, y = check_X_y( + X, + y, + multi_output=True + if type_of_target(y) + in ("continuous-multioutput", "multiclass-multioutput") + else False, + ) + # Check the input for classification + y = self._encode_class_labels(y) + + super().fit(X, y, sample_weight) + def predict_proba(self, X): """ Predict class probabilities for X. Parameters ---------- - X : :obj:`numpy.ndarray` of shape (n_samples, n_features) + X : :obj: array-like of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``np.uint8``. @@ -796,6 +1452,8 @@ def predict_proba(self, X): proba : :obj:`numpy.ndarray` of shape (n_samples, n_classes) The class probabilities of the input samples. """ + X = check_array(X) + if not self.is_fitted_: raise AttributeError("Please fit the model first.") self._check_input(X) @@ -822,7 +1480,8 @@ def predict_proba(self, X): binner_, X_aug_test_, is_training_data=False ) X_middle_test_ = _utils.merge_array( - X_middle_test_, X_aug_test_, self.n_features_) + X_middle_test_, X_aug_test_, self.n_features_ + ) X_aug_test_ = layer.transform(X_middle_test_) else: binner_ = self._get_binner(layer_idx) @@ -830,7 +1489,8 @@ def predict_proba(self, X): binner_, X_aug_test_, is_training_data=False ) X_middle_test_ = _utils.merge_array( - X_middle_test_, X_aug_test_, self.n_features_) + X_middle_test_, X_aug_test_, self.n_features_ + ) # Skip calling the `transform` if not using the predictor if self.use_predictor: @@ -843,15 +1503,21 @@ def predict_proba(self, X): binner_ = self._get_binner(self.n_layers_) X_aug_test_ = self._bin_data( - binner_, X_aug_test_, is_training_data=False) + binner_, X_aug_test_, is_training_data=False + ) X_middle_test_ = _utils.merge_array( - X_middle_test_, X_aug_test_, self.n_features_) + X_middle_test_, X_aug_test_, self.n_features_ + ) predictor = self.buffer_.load_predictor(self.predictor_) proba = predictor.predict_proba(X_middle_test_) else: - proba = layer.predict_full(X_middle_test_) - proba = _utils.merge_proba(proba, self.n_outputs_) + if self.n_layers_ > 1: + proba = layer.predict_full(X_middle_test_) + proba = _utils.merge_proba(proba, self.n_outputs_) + else: + # Directly merge results with one cascade layer only + proba = _utils.merge_proba(X_aug_test_, self.n_outputs_) return proba @@ -861,7 +1527,7 @@ def predict(self, X): Parameters ---------- - X : :obj:`numpy.ndarray` of shape (n_samples, n_features) + X : :obj: array-like of shape (n_samples, n_features) The input samples. Internally, its dtype will be converted to ``np.uint8``. @@ -870,6 +1536,200 @@ def predict(self, X): y : :obj:`numpy.ndarray` of shape (n_samples,) The predicted classes. 
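# CascadeForestClassifier above encodes arbitrary class labels with a
# LabelEncoder before training and decodes them again in predict(), so string
# targets can be used directly. Minimal sketch (import path assumed as above):
import numpy as np
from deepforest import CascadeForestClassifier

rng = np.random.RandomState(0)
X = rng.rand(200, 5)
y = np.where(X[:, 0] > 0.5, "spam", "ham")   # string class labels

clf = CascadeForestClassifier(random_state=0)
clf.fit(X, y)
print(clf.predict(X[:4]))                    # predictions come back as "spam" / "ham"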
""" + X = check_array(X) + proba = self.predict_proba(X) + y = self._decode_class_labels(np.argmax(proba, axis=1)) + return y + + +@deepforest_model_doc( + """Implementation of the deep forest for regression.""", "regressor_model" +) +class CascadeForestRegressor(BaseCascadeForest, RegressorMixin): + def __init__( + self, + n_bins=255, + bin_subsample=200000, + bin_type="percentile", + max_layers=20, + criterion="mse", + n_estimators=2, + n_trees=100, + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + use_predictor=False, + predictor="forest", + predictor_kwargs={}, + backend="custom", + n_tolerant_rounds=2, + delta=1e-5, + partial_mode=False, + n_jobs=None, + random_state=None, + verbose=1, + ): + super().__init__( + n_bins=n_bins, + bin_subsample=bin_subsample, + bin_type=bin_type, + max_layers=max_layers, + criterion=criterion, + n_estimators=n_estimators, + n_trees=n_trees, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + use_predictor=use_predictor, + predictor=predictor, + predictor_kwargs=predictor_kwargs, + backend=backend, + n_tolerant_rounds=n_tolerant_rounds, + delta=delta, + partial_mode=partial_mode, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + ) + + # Used to deal with target values + self.type_of_target_ = None + + def _check_target_values(self, y): + """Check the input target values for regressor.""" + self.type_of_target_ = type_of_target(y) + + if not self._check_array_numeric(y): + msg = ( + "CascadeForestRegressor only accepts numeric values as" + " valid target values." + ) + raise ValueError(msg) + + if self.type_of_target_ not in ( + "continuous", + "continuous-multioutput", + "multiclass", + "multiclass-multioutput", + ): + msg = ( + "CascadeForestRegressor is used for univariate or" + " multi-variate regression, but the target values seem not" + " to be one of them." + ) + raise ValueError(msg) + + def _check_array_numeric(self, y): + """Check the input numpy array y is all numeric.""" + numeric_types = np.typecodes["AllInteger"] + np.typecodes["AllFloat"] + if y.dtype.kind in numeric_types: + return True + else: + return False + + def _repr_performance(self, pivot): + msg = "Val MSE = {:.5f}" + return msg.format(pivot) + + @deepforest_model_doc( + """Build a deep forest using the training data.""", "regressor_fit" + ) + def fit(self, X, y, sample_weight=None): + X, y = check_X_y( + X, + y, + multi_output=True + if type_of_target(y) + in ("continuous-multioutput", "multiclass-multioutput") + else False, + ) + + # Check the input for regression + self._check_target_values(y) + + super().fit(X, y, sample_weight) + + def predict(self, X): + """ + Predict regression target for X. + + Parameters + ---------- + X : :obj: array-like of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``np.uint8``. + + Returns + ------- + y : :obj:`numpy.ndarray` of shape (n_samples,) or (n_samples, n_outputs) + The predicted values. 
+ """ + X = check_array(X) + + if not self.is_fitted_: + raise AttributeError("Please fit the model first.") + self._check_input(X) + + if self.verbose > 0: + print("{} Start to evalute the model:".format(_utils.ctime())) + + binner_ = self._get_binner(0) + X_test = self._bin_data(binner_, X, is_training_data=False) + X_middle_test_ = _utils.init_array(X_test, self.n_aug_features_) + + for layer_idx in range(self.n_layers_): + layer = self._get_layer(layer_idx) + + if self.verbose > 0: + msg = "{} Evaluating cascade layer = {:<2}" + print(msg.format(_utils.ctime(), layer_idx)) + + if layer_idx == 0: + X_aug_test_ = layer.transform(X_test) + elif layer_idx < self.n_layers_ - 1: + binner_ = self._get_binner(layer_idx) + X_aug_test_ = self._bin_data( + binner_, X_aug_test_, is_training_data=False + ) + X_middle_test_ = _utils.merge_array( + X_middle_test_, X_aug_test_, self.n_features_ + ) + X_aug_test_ = layer.transform(X_middle_test_) + else: + binner_ = self._get_binner(layer_idx) + X_aug_test_ = self._bin_data( + binner_, X_aug_test_, is_training_data=False + ) + X_middle_test_ = _utils.merge_array( + X_middle_test_, X_aug_test_, self.n_features_ + ) + + # Skip calling the `transform` if not using the predictor + if self.use_predictor: + X_aug_test_ = layer.transform(X_middle_test_) + + if self.use_predictor: + + if self.verbose > 0: + print("{} Evaluating the predictor".format(_utils.ctime())) + + binner_ = self._get_binner(self.n_layers_) + X_aug_test_ = self._bin_data( + binner_, X_aug_test_, is_training_data=False + ) + X_middle_test_ = _utils.merge_array( + X_middle_test_, X_aug_test_, self.n_features_ + ) + + predictor = self.buffer_.load_predictor(self.predictor_) + _y = predictor.predict(X_middle_test_) + else: + if self.n_layers_ > 1: + _y = layer.predict_full(X_middle_test_) + _y = _utils.merge_proba(_y, self.n_outputs_) + else: + # Directly merge results with one cascade layer only + _y = _utils.merge_proba(X_aug_test_, self.n_outputs_) - return np.argmax(proba, axis=1) + return _y diff --git a/deepforest/forest.py b/deepforest/forest.py index a670ef9..a917729 100644 --- a/deepforest/forest.py +++ b/deepforest/forest.py @@ -6,8 +6,12 @@ """ -__all__ = ["RandomForestClassifier", - "ExtraTreesClassifier"] +__all__ = [ + "RandomForestClassifier", + "RandomForestRegressor", + "ExtraTreesClassifier", + "ExtraTreesRegressor", +] import numbers from warnings import warn @@ -23,18 +27,22 @@ from sklearn.base import clone from sklearn.base import BaseEstimator from sklearn.base import MetaEstimatorMixin -from sklearn.base import ClassifierMixin, MultiOutputMixin -from sklearn.utils import (check_random_state, - compute_sample_weight) +from sklearn.base import is_classifier +from sklearn.base import ClassifierMixin, RegressorMixin, MultiOutputMixin +from sklearn.utils import check_random_state, compute_sample_weight from sklearn.exceptions import DataConversionWarning -from sklearn.utils.fixes import _joblib_parallel_args from sklearn.utils.validation import check_is_fitted, _check_sample_weight from sklearn.utils.validation import _deprecate_positional_args from . import _cutils as _LIB from . 
import _forest as _C_FOREST -from .tree import DecisionTreeClassifier, ExtraTreeClassifier +from .tree import ( + DecisionTreeClassifier, + DecisionTreeRegressor, + ExtraTreeClassifier, + ExtraTreeRegressor, +) from .tree._tree import DOUBLE @@ -92,24 +100,33 @@ def _generate_sample_mask(random_state, n_samples, n_samples_bootstrap): def _parallel_build_trees( - tree, - X, - y, - n_samples_bootstrap, - out, - lock + tree, + X, + y, + n_samples_bootstrap, + sample_weight, + out, + mask, + is_classifier, + lock, ): """ Private function used to fit a single tree in parallel.""" n_samples = X.shape[0] - sample_mask = _generate_sample_mask(tree.random_state, n_samples, - n_samples_bootstrap) + sample_mask = _generate_sample_mask( + tree.random_state, n_samples, n_samples_bootstrap + ) # Fit the tree on the bootstrapped samples - feature, threshold, children, value = tree.fit(X[sample_mask], - y[sample_mask], - check_input=False) + if sample_weight is not None: + sample_weight = sample_weight[sample_mask] + feature, threshold, children, value = tree.fit( + X[sample_mask], + y[sample_mask], + sample_weight=sample_weight, + check_input=False, + ) if not children.flags["C_CONTIGUOUS"]: children = np.ascontiguousarray(children) @@ -117,17 +134,20 @@ def _parallel_build_trees( if not value.flags["C_CONTIGUOUS"]: value = np.ascontiguousarray(value) - value = np.squeeze(value, axis=1) - value /= value.sum(axis=1)[:, np.newaxis] + if is_classifier: + value = np.squeeze(value, axis=1) + value /= value.sum(axis=1)[:, np.newaxis] + else: + if len(value.shape) == 3: + value = np.squeeze(value, axis=2) # Set the OOB predictions - oob_prediction = _C_FOREST.predict(X[~sample_mask, :], - feature, - threshold, - children, - value) - + oob_prediction = _C_FOREST.predict( + X[~sample_mask, :], feature, threshold, children, value + ) with lock: + + mask += ~sample_mask out[~sample_mask, :] += oob_prediction return feature, threshold, children, value @@ -165,7 +185,7 @@ def _set_random_states(estimator, random_state=None): random_state = check_random_state(random_state) to_set = {} for key in sorted(estimator.get_params(deep=True)): - if key == 'random_state' or key.endswith('__random_state'): + if key == "random_state" or key.endswith("__random_state"): to_set[key] = random_state.randint(np.iinfo(np.int32).max) if to_set: @@ -179,9 +199,10 @@ def _partition_estimators(n_estimators, n_jobs): n_jobs = min(effective_n_jobs(n_jobs), n_estimators) # Partition estimators between jobs - n_estimators_per_job = np.full(n_jobs, n_estimators // n_jobs, - dtype=np.int) - n_estimators_per_job[:n_estimators % n_jobs] += 1 + n_estimators_per_job = np.full( + n_jobs, n_estimators // n_jobs, dtype=int + ) + n_estimators_per_job[: n_estimators % n_jobs] += 1 starts = np.cumsum(n_estimators_per_job) return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist() @@ -190,7 +211,6 @@ def _partition_estimators(n_estimators, n_jobs): def _accumulate_prediction(feature, threshold, children, value, X, out, lock): """This is a utility function for joblib's Parallel.""" prediction = _C_FOREST.predict(X, feature, threshold, children, value) - with lock: if len(out) == 1: out[0] += prediction @@ -231,8 +251,9 @@ class BaseEnsemble(MetaEstimatorMixin, BaseEstimator, metaclass=ABCMeta): _required_parameters: List[str] = [] @abstractmethod - def __init__(self, base_estimator, *, n_estimators=10, - estimator_params=tuple()): + def __init__( + self, base_estimator, *, n_estimators=10, estimator_params=tuple() + ): # Set parameters 
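# _parallel_build_trees and _partition_estimators above split the forest's
# trees across joblib workers. A standalone sketch of the partitioning rule
# (n_estimators // n_jobs trees per job, remainder spread over the first
# jobs); the effective_n_jobs clamp used by the real function is omitted here.
import numpy as np

def partition_estimators(n_estimators, n_jobs):
    n_estimators_per_job = np.full(n_jobs, n_estimators // n_jobs, dtype=int)
    n_estimators_per_job[: n_estimators % n_jobs] += 1
    starts = np.cumsum(n_estimators_per_job)
    return n_jobs, n_estimators_per_job.tolist(), [0] + starts.tolist()

print(partition_estimators(10, 3))   # (3, [4, 3, 3], [0, 4, 7, 10])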
self.base_estimator = base_estimator self.n_estimators = n_estimators @@ -248,12 +269,16 @@ def _validate_estimator(self, default=None): Sets the base_estimator_` attributes. """ if not isinstance(self.n_estimators, numbers.Integral): - raise ValueError("n_estimators must be an integer, " - "got {0}.".format(type(self.n_estimators))) + raise ValueError( + "n_estimators must be an integer, " + "got {0}.".format(type(self.n_estimators)) + ) if self.n_estimators <= 0: - raise ValueError("n_estimators must be greater than zero, " - "got {0}.".format(self.n_estimators)) + raise ValueError( + "n_estimators must be greater than zero, " + "got {0}.".format(self.n_estimators) + ) if self.base_estimator is not None: self.base_estimator_ = self.base_estimator @@ -270,12 +295,14 @@ def _make_estimator(self, append=True, random_state=None): sub-estimators. """ estimator = clone(self.base_estimator_) - estimator.set_params(**{p: getattr(self, p) - for p in self.estimator_params}) + estimator.set_params( + **{p: getattr(self, p) for p in self.estimator_params} + ) # Pass the inferred class information to avoid redudant finding. - estimator.classes_ = self.classes_ - estimator.n_classes_ = np.array(self.n_classes_, dtype=np.int32) + if is_classifier(estimator): + estimator.classes_ = self.classes_ + estimator.n_classes_ = np.array(self.n_classes_, dtype=np.int32) if random_state is not None: _set_random_states(estimator, random_state) @@ -307,19 +334,23 @@ class BaseForest(MultiOutputMixin, BaseEnsemble, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, - base_estimator, - n_estimators=100, *, - estimator_params=tuple(), - n_jobs=None, - random_state=None, - verbose=0, - class_weight=None, - max_samples=None): + def __init__( + self, + base_estimator, + n_estimators=100, + *, + estimator_params=tuple(), + n_jobs=None, + random_state=None, + verbose=0, + class_weight=None, + max_samples=None + ): super().__init__( base_estimator=base_estimator, n_estimators=n_estimators, - estimator_params=estimator_params) + estimator_params=estimator_params, + ) self.n_jobs = n_jobs self.random_state = random_state @@ -378,10 +409,13 @@ def fit(self, X, y, sample_weight=None): y = np.atleast_1d(y) if y.ndim == 2 and y.shape[1] == 1: - warn("A column-vector y was passed when a 1d array was" - " expected. Please change the shape of y to " - "(n_samples,), for example using ravel().", - DataConversionWarning, stacklevel=2) + warn( + "A column-vector y was passed when a 1d array was" + " expected. 
Please change the shape of y to " + "(n_samples,), for example using ravel().", + DataConversionWarning, + stacklevel=2, + ) if y.ndim == 1: # reshape is necessary to preserve the data contiguity against vs @@ -394,10 +428,15 @@ def fit(self, X, y, sample_weight=None): if getattr(y, "dtype", None) != DOUBLE or not y.flags.contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) + if expanded_class_weight is not None: + if sample_weight is not None: + sample_weight = sample_weight * expanded_class_weight + else: + sample_weight = expanded_class_weight + # Get bootstrap sample size n_samples_bootstrap = _get_n_samples_bootstrap( - n_samples=X.shape[0], - max_samples=self.max_samples + n_samples=X.shape[0], max_samples=self.max_samples ) # Check parameters @@ -405,27 +444,40 @@ def fit(self, X, y, sample_weight=None): random_state = check_random_state(self.random_state) n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) - trees = [self._make_estimator(append=False, - random_state=random_state) - for i in range(self.n_estimators)] + trees = [ + self._make_estimator(append=False, random_state=random_state) + for i in range(self.n_estimators) + ] # Pre-allocate OOB estimations - oob_decision_function = np.zeros((n_samples, - self.classes_[0].shape[0])) + if is_classifier(self): + oob_decision_function = np.zeros( + (n_samples, self.classes_[0].shape[0]) + ) + else: + oob_decision_function = np.zeros((n_samples, self.n_outputs_)) + mask = np.zeros(n_samples) lock = threading.Lock() - rets = Parallel(n_jobs=n_jobs, verbose=self.verbose, - **_joblib_parallel_args(prefer='threads', - require="sharedmem"))( + rets = Parallel( + n_jobs=n_jobs, + verbose=self.verbose, + prefer="threads", + require="sharedmem", + )( delayed(_parallel_build_trees)( t, X, y, n_samples_bootstrap, + sample_weight, oob_decision_function, - lock) - for i, t in enumerate(trees)) - + mask, + is_classifier(self), + lock, + ) + for i, t in enumerate(trees) + ) # Collect newly grown trees for feature, threshold, children, value in rets: @@ -437,13 +489,22 @@ def fit(self, X, y, sample_weight=None): self.values.append(value) # Check the OOB predictions - if (oob_decision_function.sum(axis=1) == 0).any(): - warn("Some inputs do not have OOB predictions. " - "This probably means too few trees were used " - "to compute any reliable oob predictions.") - - prediction = (oob_decision_function / - oob_decision_function.sum(axis=1)[:, np.newaxis]) + if ( + is_classifier(self) + and (oob_decision_function.sum(axis=1) == 0).any() + ): + warn( + "Some inputs do not have OOB predictions. " + "This probably means too few trees were used " + "to compute any reliable oob predictions." 
+ ) + if is_classifier(self): + prediction = ( + oob_decision_function + / oob_decision_function.sum(axis=1)[:, np.newaxis] + ) + else: + prediction = oob_decision_function / mask.reshape(-1, 1) self.oob_decision_function_ = prediction @@ -468,15 +529,18 @@ class ForestClassifier(ClassifierMixin, BaseForest, metaclass=ABCMeta): """ @abstractmethod - def __init__(self, - base_estimator, - n_estimators=100, *, - estimator_params=tuple(), - n_jobs=None, - random_state=None, - verbose=0, - class_weight=None, - max_samples=None): + def __init__( + self, + base_estimator, + n_estimators=100, + *, + estimator_params=tuple(), + n_jobs=None, + random_state=None, + verbose=0, + class_weight=None, + max_samples=None + ): super().__init__( base_estimator, n_estimators=n_estimators, @@ -485,7 +549,8 @@ def __init__(self, random_state=random_state, verbose=verbose, class_weight=class_weight, - max_samples=max_samples) + max_samples=max_samples, + ) def _validate_y_class_weight(self, y): @@ -498,31 +563,33 @@ def _validate_y_class_weight(self, y): self.classes_ = [] self.n_classes_ = [] - y_store_unique_indices = np.zeros(y.shape, dtype=np.int) + y_store_unique_indices = np.zeros(y.shape, dtype=int) for k in range(self.n_outputs_): - classes_k, y_store_unique_indices[:, k] = \ - np.unique(y[:, k], return_inverse=True) + classes_k, y_store_unique_indices[:, k] = np.unique( + y[:, k], return_inverse=True + ) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) y = y_store_unique_indices if self.class_weight is not None: - valid_presets = ('balanced', 'balanced_subsample') + valid_presets = ("balanced", "balanced_subsample") if isinstance(self.class_weight, str): if self.class_weight not in valid_presets: - raise ValueError('Valid presets for class_weight include ' - '"balanced" and "balanced_subsample".' - 'Given "%s".' - % self.class_weight) + raise ValueError( + "Valid presets for class_weight include " + '"balanced" and "balanced_subsample".' + 'Given "%s".' 
% self.class_weight + ) - if (self.class_weight != 'balanced_subsample' or - not self.bootstrap): + if self.class_weight != "balanced_subsample" or not self.bootstrap: if self.class_weight == "balanced_subsample": class_weight = "balanced" else: class_weight = self.class_weight - expanded_class_weight = compute_sample_weight(class_weight, - y_original) + expanded_class_weight = compute_sample_weight( + class_weight, y_original + ) return y, expanded_class_weight @@ -537,11 +604,12 @@ def predict_proba(self, X): n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) # Avoid storing the output of every estimator by summing them here - all_proba = [np.zeros((X.shape[0], j), dtype=np.float64) - for j in np.atleast_1d(self.n_classes_)] + all_proba = [ + np.zeros((X.shape[0], j), dtype=np.float64) + for j in np.atleast_1d(self.n_classes_) + ] lock = threading.Lock() - Parallel(n_jobs=n_jobs, verbose=self.verbose, - **_joblib_parallel_args(require="sharedmem"))( + Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem",)( delayed(_accumulate_prediction)( self.features[i], self.thresholds[i], @@ -549,8 +617,10 @@ def predict_proba(self, X): self.values[i], X, all_proba, - lock) - for i in range(self.n_estimators)) + lock, + ) + for i in range(self.n_estimators) + ) for proba in all_proba: proba /= len(self.features) @@ -562,35 +632,45 @@ def predict_proba(self, X): class RandomForestClassifier(ForestClassifier): - @_deprecate_positional_args - def __init__(self, - n_estimators=100, *, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="sqrt", - min_impurity_decrease=0., - min_impurity_split=None, - n_jobs=None, - random_state=None, - verbose=0, - class_weight=None, - max_samples=None): + def __init__( + self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt", + min_impurity_decrease=0.0, + min_impurity_split=None, + n_jobs=None, + random_state=None, + verbose=0, + class_weight=None, + max_samples=None + ): super().__init__( base_estimator=DecisionTreeClassifier(), n_estimators=n_estimators, - estimator_params=("criterion", "max_depth", "min_samples_split", - "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "min_impurity_decrease", - "min_impurity_split", "random_state"), + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "min_impurity_decrease", + "min_impurity_split", + "random_state", + ), n_jobs=n_jobs, random_state=random_state, verbose=verbose, class_weight=class_weight, - max_samples=max_samples) + max_samples=max_samples, + ) self.criterion = criterion self.max_depth = max_depth @@ -603,35 +683,244 @@ def __init__(self, class ExtraTreesClassifier(ForestClassifier): - @_deprecate_positional_args - def __init__(self, - n_estimators=100, *, - criterion="gini", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="sqrt", - min_impurity_decrease=0., - min_impurity_split=None, - n_jobs=None, - random_state=None, - verbose=0, - class_weight=None, - max_samples=None): + def __init__( + self, + n_estimators=100, + *, + criterion="gini", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="sqrt", + min_impurity_decrease=0.0, + min_impurity_split=None, + n_jobs=None, + 
random_state=None, + verbose=0, + class_weight=None, + max_samples=None + ): super().__init__( base_estimator=ExtraTreeClassifier(), n_estimators=n_estimators, - estimator_params=("criterion", "max_depth", "min_samples_split", - "min_samples_leaf", "min_weight_fraction_leaf", - "max_features", "min_impurity_decrease", - "min_impurity_split", "random_state"), + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "min_impurity_decrease", + "min_impurity_split", + "random_state", + ), n_jobs=n_jobs, random_state=random_state, verbose=verbose, class_weight=class_weight, - max_samples=max_samples) + max_samples=max_samples, + ) + + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.min_impurity_decrease = min_impurity_decrease + self.min_impurity_split = min_impurity_split + + +class ForestRegressor(RegressorMixin, BaseForest, metaclass=ABCMeta): + """ + Base class for forest of trees-based regressors. + + Warning: This class should not be used directly. Use derived classes + instead. + """ + + @abstractmethod + def __init__( + self, + base_estimator, + n_estimators=100, + *, + estimator_params=tuple(), + n_jobs=None, + random_state=None, + verbose=0, + max_samples=None + ): + super().__init__( + base_estimator, + n_estimators=n_estimators, + estimator_params=estimator_params, + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + max_samples=max_samples, + ) + + def predict(self, X): + """ + Predict regression target for X. + + The predicted regression target of an input sample is computed as the + mean predicted regression targets of the trees in the forest. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The input samples. Internally, its dtype will be converted to + ``dtype=np.float32``. If a sparse matrix is provided, it will be + converted into a sparse ``csr_matrix``. + + Returns + ------- + y : ndarray of shape (n_samples,) or (n_samples, n_outputs) + The predicted values. + """ + check_is_fitted(self) + + # Assign chunk of trees to jobs + n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) + + # avoid storing the output of every estimator by summing them here + y_hat = np.zeros((X.shape[0], self.n_outputs_), dtype=np.float64) + + # Parallel loop + lock = threading.Lock() + Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem",)( + delayed(_accumulate_prediction)( + self.features[i], + self.thresholds[i], + self.childrens[i], + self.values[i], + X, + [y_hat], + lock, + ) + for i in range(self.n_estimators) + ) + + y_hat /= self.n_estimators + return y_hat + + @staticmethod + def _get_oob_predictions(tree, X): + """Compute the OOB predictions for an individual tree. + + Parameters + ---------- + tree : DecisionTreeRegressor object + A single decision tree regressor. + X : ndarray of shape (n_samples, n_features) + The OOB samples. + + Returns + ------- + y_pred : ndarray of shape (n_samples, 1, n_outputs) + The OOB associated predictions. 
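# ForestRegressor.predict above sums every tree's prediction into a shared
# buffer under a lock and divides by the number of trees, so the regression
# output is the plain mean over trees. Equivalent toy sketch:
import numpy as np

tree_predictions = [np.array([1.0, 2.0]), np.array([3.0, 2.0]), np.array([2.0, 5.0])]
y_hat = np.zeros(2)
for pred in tree_predictions:        # _accumulate_prediction adds each tree into y_hat
    y_hat += pred
y_hat /= len(tree_predictions)       # divide by n_estimators
print(y_hat)                         # [2. 3.]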
+ """ + y_pred = tree.predict(X, check_input=False) + if y_pred.ndim == 1: + # single output regression + y_pred = y_pred[:, np.newaxis, np.newaxis] + else: + # multioutput regression + y_pred = y_pred[:, np.newaxis, :] + return y_pred + + +class RandomForestRegressor(ForestRegressor): + @_deprecate_positional_args + def __init__( + self, + n_estimators=100, + *, + criterion="mse", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + min_impurity_decrease=0.0, + min_impurity_split=None, + n_jobs=None, + random_state=None, + verbose=0, + max_samples=None + ): + super().__init__( + base_estimator=DecisionTreeRegressor(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "min_impurity_decrease", + "min_impurity_split", + "random_state", + ), + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + max_samples=max_samples, + ) + + self.criterion = criterion + self.max_depth = max_depth + self.min_samples_split = min_samples_split + self.min_samples_leaf = min_samples_leaf + self.min_weight_fraction_leaf = min_weight_fraction_leaf + self.max_features = max_features + self.min_impurity_decrease = min_impurity_decrease + self.min_impurity_split = min_impurity_split + + +class ExtraTreesRegressor(ForestRegressor): + @_deprecate_positional_args + def __init__( + self, + n_estimators=100, + *, + criterion="mse", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + min_impurity_decrease=0.0, + min_impurity_split=None, + n_jobs=None, + random_state=None, + verbose=0, + max_samples=None + ): + super().__init__( + base_estimator=ExtraTreeRegressor(), + n_estimators=n_estimators, + estimator_params=( + "criterion", + "max_depth", + "min_samples_split", + "min_samples_leaf", + "min_weight_fraction_leaf", + "max_features", + "min_impurity_decrease", + "min_impurity_split", + "random_state", + ), + n_jobs=n_jobs, + random_state=random_state, + verbose=verbose, + max_samples=max_samples, + ) self.criterion = criterion self.max_depth = max_depth diff --git a/deepforest/setup.py b/deepforest/setup.py index 6b516e3..86fc494 100644 --- a/deepforest/setup.py +++ b/deepforest/setup.py @@ -16,20 +16,26 @@ def configuration(parent_package="", top_path=None): config = Configuration("deepforest", parent_package, top_path) config.add_subpackage("tree") - config.add_extension("_forest", - sources=["_forest.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - extra_compile_args=["-O3"]) - - config.add_extension("_cutils", - sources=["_cutils.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - extra_compile_args=["-O3"]) - - msg = ("Please install cython with a version >= {} in order to build a" - " deepforest development version.") + config.add_extension( + "_forest", + sources=["_forest.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + extra_compile_args=["-O3"], + ) + + config.add_extension( + "_cutils", + sources=["_cutils.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + extra_compile_args=["-O3"], + ) + + msg = ( + "Please install cython with a version >= {} in order to build a" + " deepforest development version." 
+ ) msg = msg.format(CYTHON_MIN_VERSION) try: @@ -50,4 +56,5 @@ def configuration(parent_package="", top_path=None): if __name__ == "__main__": from numpy.distutils.core import setup + setup(**configuration().todict()) diff --git a/deepforest/tree/__init__.py b/deepforest/tree/__init__.py index 3240bf8..e1b0767 100644 --- a/deepforest/tree/__init__.py +++ b/deepforest/tree/__init__.py @@ -1,8 +1,13 @@ from .tree import BaseDecisionTree from .tree import DecisionTreeClassifier +from .tree import DecisionTreeRegressor from .tree import ExtraTreeClassifier +from .tree import ExtraTreeRegressor - -__all__ = ["BaseDecisionTree", - "DecisionTreeClassifier", - "ExtraTreeClassifier"] +__all__ = [ + "BaseDecisionTree", + "DecisionTreeClassifier", + "DecisionTreeRegressor", + "ExtraTreeClassifier", + "ExtraTreeRegressor", +] diff --git a/deepforest/tree/_criterion.pyx b/deepforest/tree/_criterion.pyx index 93e9a9c..32bfe7d 100644 --- a/deepforest/tree/_criterion.pyx +++ b/deepforest/tree/_criterion.pyx @@ -190,9 +190,9 @@ cdef class Criterion: self.children_impurity(&impurity_left, &impurity_right) return ((self.weighted_n_node_samples / self.weighted_n_samples) * - (impurity - (self.weighted_n_right / + (impurity - (self.weighted_n_right / self.weighted_n_node_samples * impurity_right) - - (self.weighted_n_left / + - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) @@ -719,7 +719,7 @@ cdef class RegressionCriterion(Criterion): self.sum_left = calloc(n_outputs, sizeof(double)) self.sum_right = calloc(n_outputs, sizeof(double)) - if (self.sum_total == NULL or + if (self.sum_total == NULL or self.sum_left == NULL or self.sum_right == NULL): raise MemoryError() @@ -1241,7 +1241,7 @@ cdef class MAE(RegressionCriterion): w = sample_weight[i] impurity_left += fabs(self.y[i, k] - median) * w - p_impurity_left[0] = impurity_left / (self.weighted_n_left * + p_impurity_left[0] = impurity_left / (self.weighted_n_left * self.n_outputs) for k in range(self.n_outputs): @@ -1253,7 +1253,7 @@ cdef class MAE(RegressionCriterion): w = sample_weight[i] impurity_right += fabs(self.y[i, k] - median) * w - p_impurity_right[0] = impurity_right / (self.weighted_n_right * + p_impurity_right[0] = impurity_right / (self.weighted_n_right * self.n_outputs) diff --git a/deepforest/tree/_tree.pyx b/deepforest/tree/_tree.pyx index 6eed475..f43768a 100644 --- a/deepforest/tree/_tree.pyx +++ b/deepforest/tree/_tree.pyx @@ -252,7 +252,7 @@ cdef class DepthFirstTreeBuilder(TreeBuilder): if depth > max_depth_seen: max_depth_seen = depth - if rc >= 0: + if rc >= 0 and tree.internal_node_count > 0: rc = tree._resize_node_c(tree.internal_node_count) if rc >= 0: @@ -463,7 +463,7 @@ cdef class Tree: cdef int _resize_node_c(self, SIZE_t internal_capacity=SIZE_MAX) nogil except -1: """Resize `self.nodes` to `internal_capacity`. - + Returns -1 in case of failure to allocate memory (and raise MemoryError) or 0 otherwise. 
""" diff --git a/deepforest/tree/setup.py b/deepforest/tree/setup.py index 2465b0c..12f89ae 100644 --- a/deepforest/tree/setup.py +++ b/deepforest/tree/setup.py @@ -6,32 +6,41 @@ def configuration(parent_package="", top_path=None): config = Configuration("tree", parent_package, top_path) libraries = [] - if os.name == 'posix': - libraries.append('m') - config.add_extension("_tree", - sources=["_tree.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - extra_compile_args=["-O3"]) - config.add_extension("_splitter", - sources=["_splitter.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - extra_compile_args=["-O3"]) - config.add_extension("_criterion", - sources=["_criterion.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - extra_compile_args=["-O3"]) - config.add_extension("_utils", - sources=["_utils.pyx"], - include_dirs=[numpy.get_include()], - libraries=libraries, - extra_compile_args=["-O3"]) + if os.name == "posix": + libraries.append("m") + config.add_extension( + "_tree", + sources=["_tree.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + extra_compile_args=["-O3"], + ) + config.add_extension( + "_splitter", + sources=["_splitter.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + extra_compile_args=["-O3"], + ) + config.add_extension( + "_criterion", + sources=["_criterion.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + extra_compile_args=["-O3"], + ) + config.add_extension( + "_utils", + sources=["_utils.pyx"], + include_dirs=[numpy.get_include()], + libraries=libraries, + extra_compile_args=["-O3"], + ) return config if __name__ == "__main__": from numpy.distutils.core import setup + setup(**configuration().todict()) diff --git a/deepforest/tree/tree.py b/deepforest/tree/tree.py index c54ff3e..a73fa12 100644 --- a/deepforest/tree/tree.py +++ b/deepforest/tree/tree.py @@ -6,8 +6,12 @@ """ -__all__ = ["DecisionTreeClassifier", - "ExtraTreeClassifier"] +__all__ = [ + "DecisionTreeClassifier", + "DecisionTreeRegressor", + "ExtraTreeClassifier", + "ExtraTreeRegressor", +] import numbers import warnings @@ -19,6 +23,7 @@ from sklearn.base import BaseEstimator from sklearn.base import ClassifierMixin +from sklearn.base import RegressorMixin from sklearn.base import is_classifier from sklearn.base import MultiOutputMixin from sklearn.utils import check_array @@ -44,9 +49,12 @@ DOUBLE = _tree.DOUBLE CRITERIA_CLF = {"gini": _criterion.Gini, "entropy": _criterion.Entropy} +CRITERIA_REG = {"mse": _criterion.MSE, "mae": _criterion.MAE} -DENSE_SPLITTERS = {"best": _splitter.BestSplitter, - "random": _splitter.RandomSplitter} +DENSE_SPLITTERS = { + "best": _splitter.BestSplitter, + "random": _splitter.RandomSplitter, +} # ============================================================================= # Base decision tree @@ -59,21 +67,25 @@ class BaseDecisionTree(MultiOutputMixin, BaseEstimator, metaclass=ABCMeta): Warning: This class should not be used directly. Use derived classes instead. 
""" + @abstractmethod @_deprecate_positional_args - def __init__(self, *, - criterion, - splitter, - max_depth, - min_samples_split, - min_samples_leaf, - min_weight_fraction_leaf, - max_features, - random_state, - min_impurity_decrease, - min_impurity_split, - class_weight=None, - presort='deprecated'): + def __init__( + self, + *, + criterion, + splitter, + max_depth, + min_samples_split, + min_samples_leaf, + min_weight_fraction_leaf, + max_features, + random_state, + min_impurity_decrease, + min_impurity_split, + class_weight=None, + presort="deprecated", + ): self.criterion = criterion self.splitter = splitter self.max_depth = max_depth @@ -125,13 +137,14 @@ def n_internals(self): check_is_fitted(self) return self.tree_.n_internals - def fit(self, X, y, sample_weight=None, check_input=True, - X_idx_sorted=None): + def fit( + self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None + ): random_state = check_random_state(self.random_state) if X.dtype != np.uint8: - msg = 'The dtype of `X` should be `np.uint8`, but got {} instead.' + msg = "The dtype of `X` should be `np.uint8`, but got {} instead." raise RuntimeError(msg.format(X.dtype)) if check_input: @@ -140,9 +153,9 @@ def fit(self, X, y, sample_weight=None, check_input=True, # csr. check_X_params = dict(dtype=DTYPE, accept_sparse="csc") check_y_params = dict(ensure_2d=False, dtype=None) - X, y = self._validate_data(X, y, - validate_separately=(check_X_params, - check_y_params)) + X, y = self._validate_data( + X, y, validate_separately=(check_X_params, check_y_params) + ) # Determine output settings n_samples, self.n_features_ = X.shape @@ -158,7 +171,7 @@ def fit(self, X, y, sample_weight=None, check_input=True, self.n_outputs_ = y.shape[1] # `classes_` and `n_classes_` were set by the forest. - if not hasattr(self, "classes_"): + if not hasattr(self, "classes_") and is_classifier(self): check_classification_targets(y) y = np.copy(y) @@ -170,15 +183,17 @@ def fit(self, X, y, sample_weight=None, check_input=True, y_encoded = np.zeros(y.shape, dtype=np.int) for k in range(self.n_outputs_): - classes_k, y_encoded[:, k] = np.unique(y[:, k], - return_inverse=True) + classes_k, y_encoded[:, k] = np.unique( + y[:, k], return_inverse=True + ) self.classes_.append(classes_k) self.n_classes_.append(classes_k.shape[0]) y = y_encoded if self.class_weight is not None: expanded_class_weight = compute_sample_weight( - self.class_weight, y_original) + self.class_weight, y_original + ) self.n_classes_ = np.array(self.n_classes_, dtype=np.int32) @@ -186,35 +201,42 @@ def fit(self, X, y, sample_weight=None, check_input=True, y = np.ascontiguousarray(y, dtype=DOUBLE) # Check parameters - max_depth = (np.iinfo(np.int32).max if self.max_depth is None - else self.max_depth) + max_depth = ( + np.iinfo(np.int32).max + if self.max_depth is None + else self.max_depth + ) if isinstance(self.min_samples_leaf, numbers.Integral): if not 1 <= self.min_samples_leaf: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) min_samples_leaf = self.min_samples_leaf else: # float - if not 0. 
< self.min_samples_leaf <= 0.5: - raise ValueError("min_samples_leaf must be at least 1 " - "or in (0, 0.5], got %s" - % self.min_samples_leaf) + if not 0.0 < self.min_samples_leaf <= 0.5: + raise ValueError( + "min_samples_leaf must be at least 1 " + "or in (0, 0.5], got %s" % self.min_samples_leaf + ) min_samples_leaf = int(ceil(self.min_samples_leaf * n_samples)) if isinstance(self.min_samples_split, numbers.Integral): if not 2 <= self.min_samples_split: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the integer %s" - % self.min_samples_split) + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the integer %s" % self.min_samples_split + ) min_samples_split = self.min_samples_split else: # float - if not 0. < self.min_samples_split <= 1.: - raise ValueError("min_samples_split must be an integer " - "greater than 1 or a float in (0.0, 1.0]; " - "got the float %s" - % self.min_samples_split) + if not 0.0 < self.min_samples_split <= 1.0: + raise ValueError( + "min_samples_split must be an integer " + "greater than 1 or a float in (0.0, 1.0]; " + "got the float %s" % self.min_samples_split + ) min_samples_split = int(ceil(self.min_samples_split * n_samples)) min_samples_split = max(2, min_samples_split) @@ -226,25 +248,30 @@ def fit(self, X, y, sample_weight=None, check_input=True, elif self.max_features == "log2": max_features = max(1, int(np.log2(self.n_features_))) else: - raise ValueError("Invalid value for max_features. " - "Allowed string values are 'auto', " - "'sqrt' or 'log2'.") + raise ValueError( + "Invalid value for max_features. " + "Allowed string values are 'auto', " + "'sqrt' or 'log2'." + ) elif self.max_features is None: max_features = self.n_features_ elif isinstance(self.max_features, numbers.Integral): max_features = self.max_features else: # float if self.max_features > 0.0: - max_features = max(1, - int(self.max_features * self.n_features_)) + max_features = max( + 1, int(self.max_features * self.n_features_) + ) else: max_features = 0 self.max_features_ = max_features if len(y) != n_samples: - raise ValueError("Number of labels=%d does not match " - "number of samples=%d" % (len(y), n_samples)) + raise ValueError( + "Number of labels=%d does not match " + "number of samples=%d" % (len(y), n_samples) + ) if not 0 <= self.min_weight_fraction_leaf <= 0.5: raise ValueError("min_weight_fraction_leaf must in [0, 0.5]") if max_depth <= 0: @@ -263,61 +290,88 @@ def fit(self, X, y, sample_weight=None, check_input=True, # Set min_weight_leaf from min_weight_fraction_leaf if sample_weight is None: - min_weight_leaf = (self.min_weight_fraction_leaf * - n_samples) + min_weight_leaf = self.min_weight_fraction_leaf * n_samples else: - min_weight_leaf = (self.min_weight_fraction_leaf * - np.sum(sample_weight)) + min_weight_leaf = self.min_weight_fraction_leaf * np.sum( + sample_weight + ) min_impurity_split = self.min_impurity_split if min_impurity_split is not None: - warnings.warn("The min_impurity_split parameter is deprecated. " - "Its default value has changed from 1e-7 to 0 in " - "version 0.23, and it will be removed in 0.25. " - "Use the min_impurity_decrease parameter instead.", - FutureWarning) - - if min_impurity_split < 0.: - raise ValueError("min_impurity_split must be greater than " - "or equal to 0") + warnings.warn( + "The min_impurity_split parameter is deprecated. 
" + "Its default value has changed from 1e-7 to 0 in " + "version 0.23, and it will be removed in 0.25. " + "Use the min_impurity_decrease parameter instead.", + FutureWarning, + ) + + if min_impurity_split < 0.0: + raise ValueError( + "min_impurity_split must be greater than " "or equal to 0" + ) else: min_impurity_split = 0 - if self.min_impurity_decrease < 0.: - raise ValueError("min_impurity_decrease must be greater than " - "or equal to 0") + if self.min_impurity_decrease < 0.0: + raise ValueError( + "min_impurity_decrease must be greater than " "or equal to 0" + ) - if self.presort != 'deprecated': - warnings.warn("The parameter 'presort' is deprecated and has no " - "effect. It will be removed in v0.24. You can " - "suppress this warning by not passing any value " - "to the 'presort' parameter.", - FutureWarning) + if self.presort != "deprecated": + warnings.warn( + "The parameter 'presort' is deprecated and has no " + "effect. It will be removed in v0.24. You can " + "suppress this warning by not passing any value " + "to the 'presort' parameter.", + FutureWarning, + ) # Build tree criterion = self.criterion if not isinstance(criterion, Criterion): - criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, - self.n_classes_) + if is_classifier(self): + criterion = CRITERIA_CLF[self.criterion]( + self.n_outputs_, self.n_classes_ + ) + else: + criterion = CRITERIA_REG[self.criterion]( + self.n_outputs_, n_samples + ) SPLITTERS = DENSE_SPLITTERS splitter = self.splitter if not isinstance(self.splitter, Splitter): - splitter = SPLITTERS[self.splitter](criterion, - self.max_features_, - min_samples_leaf, - min_weight_leaf, - random_state) - - self.tree_ = Tree(self.n_features_, self.n_classes_, self.n_outputs_) - - builder = DepthFirstTreeBuilder(splitter, min_samples_split, - min_samples_leaf, - min_weight_leaf, - max_depth, - self.min_impurity_decrease, - min_impurity_split) + splitter = SPLITTERS[self.splitter]( + criterion, + self.max_features_, + min_samples_leaf, + min_weight_leaf, + random_state, + ) + + if is_classifier(self): + self.tree_ = Tree( + self.n_features_, self.n_classes_, self.n_outputs_ + ) + else: + self.tree_ = Tree( + self.n_features_, + # TODO: tree should't need this in this case + np.array([1] * self.n_outputs_, dtype=np.int32), + self.n_outputs_, + ) + + builder = DepthFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + self.min_impurity_decrease, + min_impurity_split, + ) builder.build(self.tree_, X, y, sample_weight, X_idx_sorted) @@ -328,8 +382,9 @@ def fit(self, X, y, sample_weight=None, check_input=True, # Only return the essential data for using a tree for prediction feature = self.tree_.feature threshold = self.tree_.threshold - children = np.vstack((self.tree_.children_left, - self.tree_.children_right)).T + children = np.vstack( + (self.tree_.children_left, self.tree_.children_right) + ).T value = self.tree_.value return feature, threshold, children, value @@ -341,10 +396,11 @@ def _validate_X_predict(self, X, check_input): n_features = X.shape[1] if self.n_features_ != n_features: - raise ValueError("Number of features of the model must " - "match the input. Model n_features is %s and " - "input n_features is %s " - % (self.n_features_, n_features)) + raise ValueError( + "Number of features of the model must " + "match the input. 
Model n_features is %s and " + "input n_features is %s " % (self.n_features_, n_features) + ) return X @@ -373,27 +429,34 @@ def predict(self, X, check_input=True): """ check_is_fitted(self) X = self._validate_X_predict(X, check_input) - proba = self.tree_.predict(X) + pred = self.tree_.predict(X) - return self.classes_.take(np.argmax(proba, axis=1), axis=0) + # Classification + if is_classifier(self): + return self.classes_.take(np.argmax(pred, axis=1), axis=0) + # Regression + else: + return np.squeeze(pred) class DecisionTreeClassifier(ClassifierMixin, BaseDecisionTree): - @_deprecate_positional_args - def __init__(self, *, - criterion="gini", - splitter="best", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features=None, - random_state=None, - min_impurity_decrease=0., - min_impurity_split=None, - class_weight=None, - presort='deprecated'): + def __init__( + self, + *, + criterion="gini", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + class_weight=None, + presort="deprecated", + ): super().__init__( criterion=criterion, @@ -407,16 +470,20 @@ def __init__(self, *, random_state=random_state, min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split, - presort=presort) + presort=presort, + ) - def fit(self, X, y, sample_weight=None, check_input=True, - X_idx_sorted=None): + def fit( + self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None + ): return super().fit( - X, y, + X, + y, sample_weight=sample_weight, check_input=check_input, - X_idx_sorted=X_idx_sorted) + X_idx_sorted=X_idx_sorted, + ) def predict_proba(self, X, check_input=True): @@ -424,7 +491,7 @@ def predict_proba(self, X, check_input=True): X = self._validate_X_predict(X, check_input) proba = self.tree_.predict(X) - proba = proba[:, :self.n_classes_] + proba = proba[:, : self.n_classes_] normalizer = proba.sum(axis=1)[:, np.newaxis] normalizer[normalizer == 0.0] = 1.0 proba /= normalizer @@ -432,21 +499,66 @@ def predict_proba(self, X, check_input=True): return proba -class ExtraTreeClassifier(DecisionTreeClassifier): +class DecisionTreeRegressor(RegressorMixin, BaseDecisionTree): + @_deprecate_positional_args + def __init__( + self, + *, + criterion="mse", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + presort="deprecated", + ): + super().__init__( + criterion=criterion, + splitter=splitter, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features, + min_impurity_decrease=min_impurity_decrease, + min_impurity_split=min_impurity_split, + random_state=random_state, + ) + + def fit( + self, X, y, sample_weight=None, check_input=True, X_idx_sorted=None + ): + return super().fit( + X, + y, + sample_weight=sample_weight, + check_input=check_input, + X_idx_sorted=X_idx_sorted, + ) + + +class ExtraTreeClassifier(DecisionTreeClassifier): @_deprecate_positional_args - def __init__(self, *, - criterion="gini", - splitter="random", - max_depth=None, - min_samples_split=2, - min_samples_leaf=1, - min_weight_fraction_leaf=0., - max_features="auto", - random_state=None, - min_impurity_decrease=0., - 
min_impurity_split=None, - class_weight=None): + def __init__( + self, + *, + criterion="gini", + splitter="random", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features="auto", + random_state=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + class_weight=None, + ): super().__init__( criterion=criterion, @@ -459,4 +571,36 @@ def __init__(self, *, class_weight=class_weight, min_impurity_decrease=min_impurity_decrease, min_impurity_split=min_impurity_split, - random_state=random_state) + random_state=random_state, + ) + + +class ExtraTreeRegressor(DecisionTreeRegressor): + @_deprecate_positional_args + def __init__( + self, + *, + criterion="mse", + splitter="random", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + min_impurity_decrease=0.0, + min_impurity_split=None, + ): + + super().__init__( + criterion=criterion, + splitter=splitter, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features, + min_impurity_decrease=min_impurity_decrease, + min_impurity_split=min_impurity_split, + random_state=random_state, + ) diff --git a/deepforest/utils/__init__.py b/deepforest/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/deepforest/utils/kfoldwrapper.py b/deepforest/utils/kfoldwrapper.py new file mode 100644 index 0000000..61b47ff --- /dev/null +++ b/deepforest/utils/kfoldwrapper.py @@ -0,0 +1,104 @@ +""" +Implementation of the estimator wrapper to support customized base estimators. +""" + + +__all__ = ["KFoldWrapper"] + +import copy +import numpy as np +from sklearn.model_selection import KFold + +from .. import _utils + + +class KFoldWrapper(object): + """ + A general wrapper for base estimators without the characteristic of + out-of-bag (OOB) estimation. + """ + + def __init__( + self, + estimator, + n_splits, + n_outputs, + random_state=None, + verbose=1, + is_classifier=True, + ): + + # Parameters were already validated by upstream methods + self.dummy_estimator_ = estimator + self.n_splits = n_splits + self.n_outputs = n_outputs + self.random_state = random_state + self.verbose = verbose + self.is_classifier = is_classifier + # Internal container + self.estimators_ = [] + + @property + def estimator_(self): + """Return the list of internal estimators.""" + return self.estimators_ + + def fit_transform(self, X, y, sample_weight=None): + n_samples, _ = X.shape + splitter = KFold( + n_splits=self.n_splits, + shuffle=True, + random_state=self.random_state, + ) + self.oob_decision_function_ = np.zeros((n_samples, self.n_outputs)) + + for k, (train_idx, val_idx) in enumerate(splitter.split(X, y)): + estimator = copy.deepcopy(self.dummy_estimator_) + + if self.verbose > 1: + msg = "{} - - Fitting the base estimator with fold = {}" + print(msg.format(_utils.ctime(), k)) + + # Fit on training samples + if sample_weight is None: + # Notice that a bunch of base estimators do not take + # `sample_weight` as a valid input. 
+ estimator.fit(X[train_idx], y[train_idx]) + else: + estimator.fit( + X[train_idx], y[train_idx], sample_weight[train_idx] + ) + + # Predict on hold-out samples + if self.is_classifier: + self.oob_decision_function_[ + val_idx + ] += estimator.predict_proba(X[val_idx]) + else: + val_pred = estimator.predict(X[val_idx]) + + # Reshape for univariate regression + if self.n_outputs == 1 and len(val_pred.shape) == 1: + val_pred = np.expand_dims(val_pred, 1) + self.oob_decision_function_[val_idx] += val_pred + + # Store the estimator + self.estimators_.append(estimator) + + return self.oob_decision_function_ + + def predict(self, X): + n_samples, _ = X.shape + out = np.zeros((n_samples, self.n_outputs)) # pre-allocate results + for estimator in self.estimators_: + if self.is_classifier: + out += estimator.predict_proba(X) # classification + else: + if self.n_outputs > 1: + out += estimator.predict(X) # multi-variate regression + else: + out += estimator.predict(X).reshape( + n_samples, -1 + ) # univariate regression + + return out / self.n_splits # return the average prediction
diff --git a/docs/advanced_topics/architecture.png b/docs/advanced_topics/architecture.png new file mode 100644 index 0000000..eb904bf Binary files /dev/null and b/docs/advanced_topics/architecture.png differ
diff --git a/docs/advanced_topics/architecture.rst b/docs/advanced_topics/architecture.rst new file mode 100644 index 0000000..191e065 --- /dev/null +++ b/docs/advanced_topics/architecture.rst @@ -0,0 +1,53 @@ +Model Architecture +================== + +This page introduces the model architecture, training stage, and evaluating stage of DF21. You may find this page helpful for understanding the meanings of the different parameters listed in the `API Reference <../api_reference.html>`__. + +.. image:: ./architecture.png + :align: center + :width: 800 + +Component +~~~~~~~~~ + +This section presents the meanings of key components in DF21, along with the associated parameters. + +* :class:`Binner`: The class used to reduce the number of splitting candidates for building decision trees. + + * :obj:`n_bins`, :obj:`bin_subsample`, :obj:`bin_type` + +* :class:`Estimator`: Base estimators used in the cascade layers of DF21. Default estimators are RandomForestClassifier and ExtraTreesClassifier. + + * :obj:`n_trees`, :obj:`max_depth`, :obj:`min_samples_split`, :obj:`min_samples_leaf`, :obj:`criterion`, :obj:`backend` + +* :class:`Layer`: The cascade layer of DF21, which consists of multiple estimators. + + * :obj:`max_layers`, :obj:`n_estimators` + +* :class:`Predictor`: The optional predictor concatenated to the DF21 model. + + * :obj:`use_predictor`, :obj:`predictor`, :obj:`predictor_kwargs` + +Training +~~~~~~~~ + +The training stage of DF21 starts with discretizing the feature-wise values of training samples into ``n_bins`` unique values, a commonly-used technique for accelerating the construction of decision trees. After that, the first cascade layer in DF21, with ``n_estimators`` estimators, is produced using the binned data (notice that by default ``n_estimators`` is multiplied by 2 internally). Furthermore, each estimator consists of ``n_trees`` decision trees that adopt the splitting criterion ``criterion``, satisfying the constraints enforced by ``max_depth`` and ``min_samples_leaf``. + +After data binning and building the first cascade layer, DF21 enters the main training loop (a configuration sketch follows the list below): + +#. Bin the out-of-bag predictions of the previous cascade layer (denoted by augmented features in the figure above) using a newly-fitted :obj:`binner`; + +#. Concatenate the augmented features to the binned training samples, serving as the new training data for the cascade layer to be built; + +#. Build a new :obj:`layer` using the concatenated training data, following the same training protocol as that used to build the first cascade layer; + +#. Get the out-of-bag predictions of the :obj:`layer` and estimate its generalization performance via out-of-bag estimation; + +#. If the estimated performance is better than that of all previously-built layers, DF21 continues to build a new layer. Otherwise, the early-stopping procedure is triggered, and DF21 will terminate the training stage before reaching ``max_layers`` if the performance does not improve for ``n_tolerant_rounds`` rounds.
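For illustration, the snippet below sketches how the components above map onto the constructor of :class:`CascadeForestClassifier`. It is only a sketch: the values are placeholders, and ``X_train`` / ``y_train`` are assumed to be an existing training set.

.. code-block:: python

    from deepforest import CascadeForestClassifier

    model = CascadeForestClassifier(
        n_bins=255,            # Binner: number of discrete values per feature
        n_estimators=2,        # Layer: estimators per cascade layer (doubled internally)
        n_trees=100,           # Estimator: decision trees per estimator
        max_depth=None,        # Estimator: no depth limit on the trees
        max_layers=10,         # upper bound on the number of cascade layers
        n_tolerant_rounds=2,   # early-stopping patience
        use_predictor=True,    # Predictor: append the optional predictor
        predictor="forest",    # Predictor: use a forest-based predictor
        random_state=0,
    )
    model.fit(X_train, y_train)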
+As an optional step, DF21 builds another predictor if ``use_predictor`` is set to ``True``. This predictor takes as input the concatenated training data from the last cascade layer, and outputs the predicted class probabilities for classification problems, or the predicted values for regression problems. One can use predictors such as random forest or GBDT by setting ``predictor``, and further configure them by setting ``predictor_kwargs``. + +Evaluating +~~~~~~~~~~ + +The evaluating stage follows the sequential structure of DF21. First, the testing samples are binned using the first :obj:`binner` and passed into the first :obj:`layer`. After that, DF21 sets the augmented features to the output of the current cascade layer, and bins them using the subsequent :obj:`binner`. After concatenating the augmented features to the binned testing samples, DF21 moves to the next layer, until reaching the last cascade layer or the predictor. \ No newline at end of file
diff --git a/docs/advanced_topics/use_customized_estimator.rst b/docs/advanced_topics/use_customized_estimator.rst new file mode 100644 index 0000000..851f988 --- /dev/null +++ b/docs/advanced_topics/use_customized_estimator.rst @@ -0,0 +1,76 @@ +Use Customized Estimators +========================= + +Version v0.1.4 of :mod:`deepforest` adds support for: + +- using customized base estimators in cascade layers of deep forest +- using a customized predictor concatenated to the deep forest + +This page gives a detailed introduction on how to use this new feature. + +Instantiate the deep forest model +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To begin with, you need to instantiate a deep forest model. Notice that some parameters specified here will be overridden by downstream steps. For example, if the parameter :obj:`use_predictor` is set to ``False`` here, but :meth:`set_predictor` is called later, then the internal attribute :obj:`use_predictor` will be altered to ``True``. + +.. code-block:: python + + from deepforest import CascadeForestClassifier + model = CascadeForestClassifier() + +Instantiate your estimators +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In order to use customized estimators in the cascade layers of deep forest, the next step is to instantiate the estimators and encapsulate them into a Python list: + +.. code-block:: python + + n_estimators = 4  # the number of base estimators per cascade layer + estimators = [your_estimator(random_state=i) for i in range(n_estimators)] + +.. tip:: + + Make sure that the instantiated estimators in the list use different random seeds if seeds are manually specified. Otherwise, they will behave identically on the dataset and make the cascade layers less effective.
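For instance, assuming scikit-learn is installed, the list could be filled with gradient boosting estimators. ``GradientBoostingClassifier`` is used here purely as an example of an estimator exposing the scikit-learn API; any class satisfying the requirements described below works the same way.

.. code-block:: python

    from sklearn.ensemble import GradientBoostingClassifier

    n_estimators = 4  # the number of base estimators per cascade layer
    # Give each estimator its own seed, as recommended in the tip above
    estimators = [
        GradientBoostingClassifier(random_state=i) for i in range(n_estimators)
    ]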
+For the customized predictor, you only need to instantiate it, and there is no extra step: + +.. code-block:: python + + predictor = your_predictor() + +Deep forest will conduct internal checks to make sure that :obj:`estimators` and :obj:`predictor` are valid for training and evaluating. To pass the internal checks, the class of your customized estimators or predictor should at least implement the methods listed below: + +* :meth:`fit` for training +* **[Classification]** :meth:`predict_proba` for evaluating +* **[Regression]** :meth:`predict` for evaluating + +The names of these methods follow the convention in scikit-learn, and they are already implemented in many packages offering scikit-learn APIs (e.g., `XGBoost `__, `LightGBM `__, `CatBoost `__). Otherwise, you have to implement a wrapper around your customized estimators to make these methods callable. + +Call :meth:`set_estimator` and :meth:`set_predictor` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The core step is to call :meth:`set_estimator` and :meth:`set_predictor` to override the estimators used by default: + +.. code-block:: python + + # Customized base estimators + model.set_estimator(estimators) + + # Customized predictor + model.set_predictor(predictor) + +:meth:`set_estimator` has another parameter :obj:`n_splits`, which determines the number of folds of the internal cross-validation strategy. Its value should be at least ``2``, and the default value is ``5``. Generally speaking, a larger :obj:`n_splits` leads to better generalization performance. If you are confused about the effect of cross-validation here, please refer to `the original paper `__ for details on how deep forest adopts the cross-validation strategy to build cascade layers. + +Train and Evaluate +~~~~~~~~~~~~~~~~~~ + +The remaining steps follow the original workflow of deep forest. + +.. code-block:: python + + model.fit(X_train, y_train) + y_pred = model.predict(X_test) + +.. note:: + + When using customized estimators via :meth:`set_estimator`, deep forest adopts the cross-validation strategy to grow cascade layers. Suppose that :obj:`n_splits` is set to ``5`` when calling :meth:`set_estimator`; then each estimator will be trained five times to get the full augmented features from a cascade layer. As a result, you may experience a drastic increase in running time and memory usage.
diff --git a/docs/api_reference.rst b/docs/api_reference.rst index 6c9511d..f371364 100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -1,11 +1,55 @@ API Reference ============= -Below is the class and function reference for :mod:`deepforest`. Notice that the package is under active development, and some features may not be stable yet. +Below is the class and function reference for :mod:`deepforest`. Notice that the package is still under active development, and some features may not be stable yet. + +.. currentmodule:: deepforest.CascadeForestClassifier + +CascadeForestClassifier +----------------------- + +.. autosummary:: + + fit + predict_proba + predict + clean + get_estimator + get_layer_feature_importances + load + save + set_estimator + set_predictor .. autoclass:: deepforest.CascadeForestClassifier :members: :inherited-members: :show-inheritance: :no-undoc-members: + :exclude-members: set_params, get_params, score + :member-order: bysource + +.. currentmodule:: deepforest.CascadeForestRegressor
+ +CascadeForestRegressor +----------------------- + +.. autosummary:: + + fit + predict + clean + get_estimator + get_layer_feature_importances + load + save + set_estimator + set_predictor + +.. autoclass:: deepforest.CascadeForestRegressor + :members: + :inherited-members: + :show-inheritance: + :no-undoc-members: + :exclude-members: set_params, get_params, score :member-order: bysource
diff --git a/docs/conf.py b/docs/conf.py index 93dec78..4e9ff5d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,7 +13,13 @@ import os import sys -sys.path.insert(0, os.path.abspath('..')) +import deepforest + + +# -- Path setup -------------------------------------------------------------- +ON_READTHEDOCS = os.environ.get("READTHEDOCS") == "True" +if not ON_READTHEDOCS: + sys.path.insert(0, os.path.abspath("..")) # -- Project information ----------------------------------------------------- @@ -39,9 +45,14 @@ 'sphinx.ext.todo', 'sphinx.ext.napoleon', 'sphinx_panels', - 'sphinx_copybutton' + 'sphinx_copybutton', + "m2r2" ] +autosummary_generate = True + +source_suffix = ['.rst', '.md'] + autoapi_dirs = ['../deepforest'] autodoc_member_order = 'bysource'
diff --git a/docs/contributors.rst b/docs/contributors.rst new file mode 100644 index 0000000..ed3bf36 --- /dev/null +++ b/docs/contributors.rst @@ -0,0 +1 @@ +.. mdinclude:: ../CONTRIBUTORS.md \ No newline at end of file
diff --git a/docs/experiments.rst b/docs/experiments.rst index 79ba98a..66d83d8 100644 --- a/docs/experiments.rst +++ b/docs/experiments.rst @@ -2,7 +2,7 @@ Experiments =========== Baseline -******** +-------- For all experiments, we used 5 popular tree-based ensemble methods as baselines. Details on the baselines are listed in the following table: +------------------+---------------------------------------------------------------+ @@ -20,7 +20,7 @@ For all experiments, we used 5 popular tree-based ensemble methods as baselines. +------------------+---------------------------------------------------------------+ Environment -*********** +----------- For all experiments, we used a single Linux server. Details on the specifications are listed in the table below. All processors were used for training and evaluating. +------------------+-----------------+--------+ @@ -30,9 +30,12 @@ For all experiments, we used a single linux server. Details on the specification +------------------+-----------------+--------+ Setting -******* +------- We kept the number of decision trees the same across all baselines, while the remaining hyper-parameters were set to their default values. Running scripts for reproducing all experiment results are available; please refer to this `Repo`_. +Classification +-------------- + Dataset ******* @@ -137,6 +140,66 @@ Some observations are listed as follow: * Histogram-based GBDT (e.g., :class:`HGBDT`, :class:`XGB HIST`, :class:`LightGBM`) are typically faster, mainly because decision trees in GBDT tend to have a much smaller tree depth; * With the number of input dimensions increasing (e.g., on mnist and fashion-mnist), random forest and deep forest can be faster. +Regression +---------- + +Dataset +******* + +We have also collected five datasets on univariate regression for a comparison on the regression problem. 
+ ++------------------+------------+-----------+------------+ +| Name | # Training | # Testing | # Features | ++==================+============+===========+============+ +| `wine`_ | 1,071 | 528 | 11 | ++------------------+------------+-----------+------------+ +| `abalone`_ | 2,799 | 1,378 | 8 | ++------------------+------------+-----------+------------+ +| `cpusmall`_ | 5,489 | 2,703 | 12 | ++------------------+------------+-----------+------------+ +| `boston`_ | 379 | 127 | 13 | ++------------------+------------+-----------+------------+ +| `diabetes`_ | 303 | 139 | 10 | ++------------------+------------+-----------+------------+ + +Testing Mean Squared Error +************************** + +The table below shows the testing mean squared error of each method, with the best result on each dataset **bolded**. Each experiment was conducted over 5 independently trials, and the average result was reported. + ++----------+-----------+---------+-----------+----------+----------+-------------+ +| Name | RF | HGBDT | XGB EXACT | XGB HIST | LightGBM | Deep Forest | ++==========+===========+=========+===========+==========+==========+=============+ +| wine | 0.35 | 0.40 | 0.41 | 0.41 | 0.39 | **0.34** | ++----------+-----------+---------+-----------+----------+----------+-------------+ +| abalone | 4.79 | 5.40 | 5.73 | 5.75 | 5.60 | **4.66** | ++----------+-----------+---------+-----------+----------+----------+-------------+ +| cpusmall | 8.31 | 9.01 | 9.86 | 11.82 | 8.99 | **7.15** | ++----------+-----------+---------+-----------+----------+----------+-------------+ +| boston | **16.61** | 20.68 | 20.61 | 19.65 | 20.27 | 19.87 | ++----------+-----------+---------+-----------+----------+----------+-------------+ +| diabetes | 3796.62 | 4333.66 | 4337.15 | 4303.96 | 4435.95 | **3431.01** | ++----------+-----------+---------+-----------+----------+----------+-------------+ + +Runtime +******* + +Runtime in seconds reported in the table below covers both the training stage and evaluating stage. + ++----------+------+-------+-----------+----------+----------+-------------+ +| Name | RF | HGBDT | XGB EXACT | XGB HIST | LightGBM | Deep Forest | ++==========+======+=======+===========+==========+==========+=============+ +| wine | 0.76 | 2.88 | 0.30 | 0.30 | 0.30 | 1.26 | ++----------+------+-------+-----------+----------+----------+-------------+ +| abalone | 0.53 | 1.57 | 0.47 | 0.50 | 0.17 | 1.29 | ++----------+------+-------+-----------+----------+----------+-------------+ +| cpusmall | 1.87 | 3.59 | 1.71 | 1.25 | 0.36 | 2.06 | ++----------+------+-------+-----------+----------+----------+-------------+ +| boston | 0.70 | 1.75 | 0.19 | 0.22 | 0.20 | 1.45 | ++----------+------+-------+-----------+----------+----------+-------------+ +| diabetes | 0.37 | 0.66 | 0.14 | 0.18 | 0.06 | 1.09 | ++----------+------+-------+-----------+----------+----------+-------------+ + .. _`Random Forest`: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html .. _`HGBDT`: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.HistGradientBoostingClassifier.html @@ -170,3 +233,13 @@ Some observations are listed as follow: .. _`mnist`: https://keras.io/api/datasets/mnist/ .. _`fashion mnist`: https://keras.io/api/datasets/fashion_mnist/ + +.. _`wine`: https://www.kaggle.com/uciml/red-wine-quality-cortez-et-al-2009 + +.. _`abalone`: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html#abalone + +.. 
_`cpusmall`: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/regression.html#cpusmall + +.. _`boston`: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_boston.html + +.. _`diabetes`: https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_diabetes.html
diff --git a/docs/index.rst b/docs/index.rst index 23965be..0d91af5 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,14 +1,14 @@ DF21 Documentation ================== -**DF21** is an implementation of Deep Forest 2021.2.1. It is designed to have the following advantages: +**DF21** is an implementation of `Deep Forest `__ 2021.2.1. It is designed to have the following advantages: - **Powerful**: Better accuracy than existing tree-based ensemble methods. - **Easy to Use**: Less effort on tuning parameters. - **Efficient**: Fast training speed and high efficiency. - **Scalable**: Capable of handling large-scale data. -Whenever one used tree-based machine learning approaches such as Random Forest or GBDT, DF21 may offer a new powerful option. This package is actively being developed, and any help would be welcomed. Please check the homepage on `Gitee `__ or `Github `__ for details. +DF21 offers an effective and powerful alternative to tree-based machine learning algorithms such as Random Forest or GBDT. This package is actively being developed, and any help would be welcomed. Please check the homepage on `Gitee `__ or `Github `__ for details. Guidepost --------- @@ -19,7 +19,7 @@ Guidepost Installation ------------ -The package is available via `PyPI `__ using: +DF21 can be installed using pip, the package installer for Python, via `PyPI `__. Refer to `this `__ for the documentation of pip. Use this command to install DF21: .. code-block:: bash @@ -28,37 +28,52 @@ The package is available via `PyPI `__ us Quickstart ---------- -.. code-block:: python +Classification +************** - from deepforest import CascadeForestClassifier +.. code-block:: python - # Load utils from sklearn.datasets import load_digits from sklearn.model_selection import train_test_split from sklearn.metrics import accuracy_score - # Load data + from deepforest import CascadeForestClassifier + X, y = load_digits(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) - model = CascadeForestClassifier(random_state=1) - - # Train model.fit(X_train, y_train) - - # Evaluate y_pred = model.predict(X_test) acc = accuracy_score(y_test, y_pred) * 100 - - print("Testing Accuracy: {:.3f} %".format(acc)) + print("\nTesting Accuracy: {:.3f} %".format(acc)) >>> Testing Accuracy: 98.667 %
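In addition to class labels, the fitted classifier can also return class probabilities, and the model can be saved to and reloaded from a local directory. The snippet below is a minimal sketch continuing the example above; ``"df21_model"`` is just an example directory name.

.. code-block:: python

    proba = model.predict_proba(X_test)  # class probabilities, shape (n_samples, n_classes)

    model.save("df21_model")             # persist the fitted model
    new_model = CascadeForestClassifier()
    new_model.load("df21_model")         # reload it later for prediction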
+Regression +********** + +.. code-block:: python + + from sklearn.datasets import load_boston + from sklearn.model_selection import train_test_split + from sklearn.metrics import mean_squared_error + + from deepforest import CascadeForestRegressor + + X, y = load_boston(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) + model = CascadeForestRegressor(random_state=1) + model.fit(X_train, y_train) + y_pred = model.predict(X_test) + mse = mean_squared_error(y_test, y_pred) + print("\nTesting MSE: {:.3f}".format(mse)) + >>> Testing MSE: 8.068 + Resources --------- * Deep Forest: `[Paper] `__ * Keynote at AISTATS 2019: `[Slides] `__ -* Source Code: `[Gitee] `__ | `[GitHub] `__ +* Source Code: `[GitHub] `__ | `[Gitee] `__ Reference --------- @@ -90,11 +105,20 @@ Reference API Reference Parameters Tunning Experiments + Report from Users + +.. toctree:: + :maxdepth: 1 + :caption: Advanced Topics + + Model Architecture <./advanced_topics/architecture> + Use Customized Estimators <./advanced_topics/use_customized_estimator> .. toctree:: :maxdepth: 1 :caption: For Developers + Contributors Changelog .. toctree:: @@ -103,8 +127,3 @@ Reference About Us Related Software - -Acknowledgement ---------------- - -The lead developer and maintainer of DF21 is Mr. `Yi-Xuan Xu `__. Before the release, it has been used internally in the LAMDA Group, Nanjing University, China.
diff --git a/docs/installation_guide.rst b/docs/installation_guide.rst index 02d1769..1a2659f 100644 --- a/docs/installation_guide.rst +++ b/docs/installation_guide.rst @@ -12,7 +12,7 @@ The stable version is available via `PyPI `__ using: The package is portable and with very few package dependencies. It is recommended to use the package environment from `Anaconda `__ since it already installs all required packages. -Notice that only the 64-bit Linux and Windows platform are officially supported. To use deep forest on Mac-OS or other platforms, you will need to build the entire package from source. +Notice that only the 64-bit Linux, Windows, and Mac-OS platforms are officially supported. To use deep forest on other platforms, you will need to build the entire package from source. Building from Source -------------------- @@ -52,10 +52,6 @@ Building from source is required to work on a contribution (bug fix, new feature $ cd tests % pytest -.. warning:: - - **[Jan 31, 2021]** The Numpy developers have released the version 1.20.0 of `Numpy `__, which makes many changes on the C-APIs, and can be incompatible with those used in the package. You are at your own risks to build the package from source with the version 1.20.0 of Numpy installed. - Acknowledgement --------------- 
diff --git a/docs/report_from_users.rst b/docs/report_from_users.rst new file mode 100644 index 0000000..6afb697 --- /dev/null +++ b/docs/report_from_users.rst @@ -0,0 +1,12 @@ +Report from Users +================= + +This page collects user reports on using deep forest. Thanks to all of them for their nice work! 
+ +Competition +----------- + +* 1st winning solution of the competition `Insurance-Pricing-Game@AIcrowd `__: `[Solution] `__ | `[Presentation] `__ + +Application +----------- \ No newline at end of file diff --git a/docs/requirements.txt b/docs/requirements.txt index b6aecca..803df12 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,7 @@ sphinx==3.1.2 sphinx_rtd_theme==0.5.0 sphinx-panels==0.5.* -sphinx-copybutton \ No newline at end of file +sphinx-copybutton +m2r2==0.2.7 +jinja2<3.1.0 +mistune==0.8.4 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 1e8ebdd..e72d738 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,8 +1,25 @@ [build-system] requires = [ - "setuptools>=42", + "setuptools<60.0", "wheel", "Cython>=0.28.5", - "numpy>=1.13.3,<1.20.0", - "scipy>=0.19.1" -] \ No newline at end of file + "oldest-supported-numpy", + "scipy>=1.3.2", +] +[tool.black] +line-length = 79 +include = '\.pyi?$' +exclude = ''' +/( + \.git + | \.hg + | \.mypy_cache + | \.tox + | \.venv + | _build + | buck-out + | build + | dist + | docs +)/ +''' diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..e58b921 --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +-r requirements.txt +pytest +pre-commit diff --git a/requirements.txt b/requirements.txt index c292f16..749d1ee 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -numpy>=1.13.3,<1.20.0 -scipy>=0.19.1 +numpy>=1.14.6 +scipy>=1.1.0 joblib>=0.11 -scikit-learn>=0.22 \ No newline at end of file +scikit-learn>=1.0 \ No newline at end of file diff --git a/setup.py b/setup.py index da7fa0e..90158c6 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,5 @@ import os import sys -import numpy -import setuptools from setuptools import find_packages from numpy.distutils.core import setup @@ -14,7 +12,7 @@ MAINTAINER = "Yi-Xuan Xu" MAINTAINER_EMAIL = "xuyx@lamda.nju.edu.cn" URL = "https://github.com/LAMDA-NJU/Deep-Forest" -VERSION = "0.1.0" +VERSION = "0.1.7" def configuration(parent_package="", top_path=None): @@ -38,36 +36,36 @@ def configuration(parent_package="", top_path=None): os.chdir(local_path) sys.path.insert(0, local_path) - setup(configuration=configuration, - name=DISTNAME, - maintainer=MAINTAINER, - maintainer_email=MAINTAINER_EMAIL, - packages=find_packages(), - include_package_data=True, - description=DESCRIPTION, - url=URL, - version=VERSION, - long_description=LONG_DESCRIPTION, - zip_safe=False, - classifiers=[ - "Intended Audience :: Science/Research", - "Intended Audience :: Developers", - "Programming Language :: C", - "Programming Language :: Python", - "Topic :: Software Development", - "Topic :: Scientific/Engineering", - "Operating System :: Microsoft :: Windows", - "Operating System :: Unix", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", - ], - python_requires=">=3.6", - install_requires=[ - "numpy>=1.13.3,<1.20.0", - "scipy>=0.19.1", - "joblib>=0.11", - "scikit-learn>=0.22", - ], - setup_requires=["cython"]) + setup( + configuration=configuration, + name=DISTNAME, + maintainer=MAINTAINER, + maintainer_email=MAINTAINER_EMAIL, + packages=find_packages(), + include_package_data=True, + description=DESCRIPTION, + url=URL, + version=VERSION, + long_description=LONG_DESCRIPTION, + zip_safe=False, + classifiers=[ + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "Programming 
Language :: Python", + "Topic :: Software Development", + "Topic :: Scientific/Engineering", + "Operating System :: Microsoft :: Windows", + "Operating System :: Unix", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + ], + python_requires=">=3.7", + install_requires=[ + "numpy>=1.14.6", + "scipy>=1.1.0", + "joblib>=0.11", + "scikit-learn>=1.0", + ], + setup_requires=["cython"], + ) diff --git a/tests/test_binner.py b/tests/test_binner.py index cb1a0c9..398f965 100644 --- a/tests/test_binner.py +++ b/tests/test_binner.py @@ -3,16 +3,15 @@ from numpy.testing import assert_allclose import pytest -from deepforest._binner import ( - Binner, - _find_binning_thresholds_per_feature -) +from deepforest._binner import Binner, _find_binning_thresholds_per_feature -kwargs = {"n_bins": 255, - "bin_subsample": 2e5, - "bin_type": "percentile", - "random_state": 0} +kwargs = { + "n_bins": 255, + "bin_subsample": 2e5, + "bin_type": "percentile", + "random_state": 0, +} def test_find_binning_thresholds_regular_data(): @@ -26,14 +25,14 @@ def test_find_binning_thresholds_regular_data(): assert_allclose(bin_thresholds, [2, 4, 6, 8]) # Interval - bin_thresholds = _find_binning_thresholds_per_feature(data, - n_bins=10, - bin_type="interval") + bin_thresholds = _find_binning_thresholds_per_feature( + data, n_bins=10, bin_type="interval" + ) assert_allclose(bin_thresholds, [1, 2, 3, 4, 5, 6, 7, 8, 9]) - bin_thresholds = _find_binning_thresholds_per_feature(data, - n_bins=5, - bin_type="interval") + bin_thresholds = _find_binning_thresholds_per_feature( + data, n_bins=5, bin_type="interval" + ) assert_allclose(bin_thresholds, [2, 4, 6, 8]) @@ -42,9 +41,9 @@ def test_find_binning_thresholds_invalid_binner_type(): err_msg = "Unknown binning type: unknown." 
with pytest.raises(ValueError, match=err_msg): - _find_binning_thresholds_per_feature(data, - n_bins=10, - bin_type="unknown") + _find_binning_thresholds_per_feature( + data, n_bins=10, bin_type="unknown" + ) def test_find_binning_thresholds_invalid_data_shape(): @@ -55,10 +54,14 @@ def test_find_binning_thresholds_invalid_data_shape(): assert "Per-feature data should be of the shape" in str(execinfo.value) -@pytest.mark.parametrize('param', - [(0, {"n_bins": 1}), - (1, {"bin_subsample": 0}), - (2, {"bin_type": "unknown"})]) +@pytest.mark.parametrize( + "param", + [ + (0, {"n_bins": 1}), + (1, {"bin_subsample": 0}), + (2, {"bin_type": "unknown"}), + ], +) def test_binner_invalid_params(param): data = np.linspace(0, 10, 1001) case_kwargs = copy.deepcopy(kwargs) diff --git a/tests/test_buffer.py b/tests/test_buffer.py index f750f25..1d918f9 100644 --- a/tests/test_buffer.py +++ b/tests/test_buffer.py @@ -5,11 +5,13 @@ from deepforest import _io as io -open_buffer = io.Buffer(use_buffer=True, - buffer_dir="./", - store_est=True, - store_pred=True, - store_data=True) +open_buffer = io.Buffer( + use_buffer=True, + buffer_dir="./", + store_est=True, + store_pred=True, + store_data=True, +) close_buffer = io.Buffer(use_buffer=False) @@ -39,13 +41,19 @@ def test_store_data_open_buffer(): layer_idx = 0 ret = open_buffer.cache_data(layer_idx, X, is_training_data=True) assert isinstance(ret, np.memmap) - assert os.path.exists(os.path.join( - open_buffer.data_dir_, "joblib_train_{}.mmap".format(layer_idx))) + assert os.path.exists( + os.path.join( + open_buffer.data_dir_, "joblib_train_{}.mmap".format(layer_idx) + ) + ) ret = open_buffer.cache_data(layer_idx, X, is_training_data=False) assert isinstance(ret, np.memmap) - assert os.path.exists(os.path.join( - open_buffer.data_dir_, "joblib_test_{}.mmap".format(layer_idx))) + assert os.path.exists( + os.path.join( + open_buffer.data_dir_, "joblib_test_{}.mmap".format(layer_idx) + ) + ) def test_load_estimator_missing(): diff --git a/tests/test_forest.py b/tests/test_forest.py index 8c6eba4..0274980 100644 --- a/tests/test_forest.py +++ b/tests/test_forest.py @@ -2,17 +2,19 @@ from deepforest import RandomForestClassifier from deepforest import ExtraTreesClassifier +from deepforest import RandomForestRegressor +from deepforest import ExtraTreesRegressor from deepforest.forest import _get_n_samples_bootstrap # Load utils from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.datasets import load_iris, load_wine -from sklearn.ensemble._forest import (_get_n_samples_bootstrap as - sklearn_get_n_samples_bootstrap) +from sklearn.datasets import load_iris, load_wine, load_boston +from sklearn.ensemble._forest import ( + _get_n_samples_bootstrap as sklearn_get_n_samples_bootstrap, +) -@pytest.mark.parametrize("max_samples", - [0.42, 42, None]) +@pytest.mark.parametrize("max_samples", [0.42, 42, None]) def test_n_samples_bootstrap(max_samples): n_samples = 420 actual = _get_n_samples_bootstrap(n_samples, max_samples) @@ -43,14 +45,15 @@ def test_n_samples_bootstrap_invalid_type(): n_samples = 42 max_samples = "42" - err_msg = ("`max_samples` should be int or float, but got type" - " ''") + err_msg = ( + "`max_samples` should be int or float, but got type" " ''" + ) with pytest.raises(TypeError, match=err_msg): _get_n_samples_bootstrap(n_samples, max_samples) @pytest.mark.parametrize("load_func", [load_iris, load_wine]) -def test_forest_workflow(load_func): +def test_forest_classifier_workflow(load_func): n_estimators = 100 # to 
avoid oob warning random_state = 42 @@ -62,15 +65,46 @@ def test_forest_workflow(load_func): X_binned = binner.fit_transform(X) # Random Forest - model = RandomForestClassifier(n_estimators=n_estimators, - random_state=random_state) + model = RandomForestClassifier( + n_estimators=n_estimators, random_state=random_state + ) model.fit(X_binned, y) model.predict(X_binned) # Extremely Random Forest - model = ExtraTreesClassifier(n_estimators=n_estimators, - random_state=random_state) + model = ExtraTreesClassifier( + n_estimators=n_estimators, random_state=random_state + ) + + model.fit(X_binned, y) + model.predict(X_binned) + + +@pytest.mark.parametrize("load_func", [load_boston]) +def test_forest_regressor_workflow(load_func): + + n_estimators = 100 # to avoid oob warning + random_state = 42 + + X, y = load_func(return_X_y=True) + + # Data binning + binner = _BinMapper(random_state=random_state) + X_binned = binner.fit_transform(X) + + # Random Forest + model = RandomForestRegressor( + n_estimators=n_estimators, random_state=random_state + ) + + model.fit(X_binned, y) + model.predict(X_binned) + + # Extremely Random Forest + model = ExtraTreesRegressor( + n_estimators=n_estimators, random_state=random_state + ) model.fit(X_binned, y) model.predict(X_binned) diff --git a/tests/test_io.py b/tests/test_io.py index 74756a8..b9179b3 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -28,8 +28,9 @@ def test_mkdir_already_exist(): def test_model_saveobj_not_exist(): - err_msg = ("Cannot find the target directory: ./tmp." - " Please create it first.") + err_msg = ( + "Cannot find the target directory: ./tmp." " Please create it first." + ) with pytest.raises(RuntimeError, match=err_msg): io.model_saveobj(save_dir, "param", None) diff --git a/tests/test_layer_estimator.py b/tests/test_layer_estimator.py index 9673955..f494fe0 100644 --- a/tests/test_layer_estimator.py +++ b/tests/test_layer_estimator.py @@ -1,11 +1,14 @@ import copy import pytest -from deepforest._layer import Layer +from deepforest._layer import ( + ClassificationCascadeLayer, + RegressionCascadeLayer, +) from deepforest._estimator import Estimator # Load utils from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -from sklearn.datasets import load_digits +from sklearn.datasets import load_digits, load_boston from sklearn.model_selection import train_test_split @@ -19,9 +22,35 @@ ) # Parameters -layer_kwargs = { +classifier_layer_kwargs = { + "layer_idx": 0, + "n_outputs": 10, + "criterion": "gini", + "n_estimators": 1, + "n_trees": 10, + "max_depth": 3, + "min_samples_leaf": 10, + "partial_mode": False, + "buffer": None, + "n_jobs": -1, + "random_state": 42, + "verbose": 2, +} + +classifier_estimator_kwargs = { + "name": "rf", + "criterion": "gini", + "n_trees": 10, + "max_depth": 3, + "min_samples_leaf": 10, + "n_jobs": -1, + "random_state": 42, +} + +regressor_layer_kwargs = { "layer_idx": 0, - "n_classes": 10, + "n_outputs": 1, + "criterion": "mse", "n_estimators": 1, "n_trees": 10, "max_depth": 3, @@ -33,8 +62,9 @@ "verbose": 2, } -estimator_kwargs = { +regressor_estimator_kwargs = { "name": "rf", + "criterion": "mse", "n_trees": 10, "max_depth": 3, "min_samples_leaf": 10, @@ -43,20 +73,53 @@ } -def test_layer_properties_after_fitting(): +def test_classifier_layer_properties_after_fitting(): - layer = Layer(**layer_kwargs) + layer = ClassificationCascadeLayer(**classifier_layer_kwargs) X_aug = layer.fit_transform(X_train, y_train) y_pred_full = layer.predict_full(X_test) # n_trees assert ( layer.n_trees_ 
- == 2 * layer_kwargs["n_estimators"] * layer_kwargs["n_trees"] + == 2 + * classifier_layer_kwargs["n_estimators"] + * classifier_layer_kwargs["n_trees"] ) # Output dim - expect_dim = 2 * layer_kwargs["n_classes"] * layer_kwargs["n_estimators"] + expect_dim = ( + 2 + * classifier_layer_kwargs["n_outputs"] + * classifier_layer_kwargs["n_estimators"] + ) + assert X_aug.shape[1] == expect_dim + assert y_pred_full.shape[1] == expect_dim + + +def test_regressor_layer_properties_after_fitting(): + # Load data and binning + X, y = load_boston(return_X_y=True) + binner = _BinMapper(random_state=142) + X_binned = binner.fit_transform(X) + + X_train, X_test, y_train, y_test = train_test_split( + X_binned, y, test_size=0.42, random_state=42 + ) + layer = RegressionCascadeLayer(**regressor_layer_kwargs) + X_aug = layer.fit_transform(X_train, y_train) + y_pred_full = layer.predict_full(X_test) + + # n_trees + assert ( + layer.n_trees_ + == 2 + * regressor_layer_kwargs["n_estimators"] + * regressor_layer_kwargs["n_trees"] + ) + + # Output dim + expect_dim = 2 * regressor_layer_kwargs["n_estimators"] assert X_aug.shape[1] == expect_dim assert y_pred_full.shape[1] == expect_dim @@ -64,11 +127,14 @@ def test_layer_properties_after_fitting(): @pytest.mark.parametrize( "param", [(0, {"n_estimators": 0}), (1, {"n_trees": 0})] ) -def test_layer_invalid_training_params(param): +@pytest.mark.parametrize( + "layer_kwargs", [(classifier_layer_kwargs), (regressor_layer_kwargs)] +) +def test_layer_invalid_training_params(param, layer_kwargs): case_kwargs = copy.deepcopy(layer_kwargs) case_kwargs.update(param[1]) - layer = Layer(**case_kwargs) + layer = ClassificationCascadeLayer(**case_kwargs) if param[0] == 0: err_msg = "`n_estimators` = 0 should be strictly positive." @@ -79,7 +145,11 @@ def test_layer_invalid_training_params(param): layer.fit_transform(X_train, y_train) -def test_estimator_unknown(): +@pytest.mark.parametrize( + "estimator_kwargs", + [(classifier_estimator_kwargs), (regressor_estimator_kwargs)], +) +def test_estimator_unknown(estimator_kwargs): case_kwargs = copy.deepcopy(estimator_kwargs) case_kwargs.update({"name": "unknown"}) diff --git a/tests/test_model_classifier.py b/tests/test_model_classifier.py new file mode 100644 index 0000000..10b57f9 --- /dev/null +++ b/tests/test_model_classifier.py @@ -0,0 +1,311 @@ +import copy +import pytest +import shutil +import numpy as np +from numpy.testing import assert_array_equal, assert_raises +from sklearn.datasets import load_iris +from sklearn.model_selection import train_test_split + +import deepforest +from deepforest import CascadeForestClassifier +from deepforest.cascade import _get_predictor_kwargs + + +save_dir = "./tmp" + +# Load data +X, y = load_iris(return_X_y=True) +X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.42, random_state=42 +) + +# Parameters +toy_kwargs = { + "n_bins": 10, + "bin_subsample": 2e5, + "max_layers": 10, + "n_estimators": 1, + "criterion": "gini", + "n_trees": 100, + "max_depth": 3, + "min_samples_split": 2, + "min_samples_leaf": 1, + "use_predictor": True, + "predictor": "forest", + "predictor_kwargs": {}, + "n_tolerant_rounds": 2, + "delta": 1e-5, + "n_jobs": -1, + "random_state": 0, + "verbose": 2, +} + +kwargs = { + "n_bins": 255, + "bin_subsample": 2e5, + "max_layers": 10, + "n_estimators": 2, + "criterion": "gini", + "n_trees": 100, + "max_depth": None, + "min_samples_split": 2, + "min_samples_leaf": 1, + "use_predictor": True, + "predictor": "forest", + "predictor_kwargs": {}, + 
"n_tolerant_rounds": 2, + "delta": 1e-5, + "n_jobs": -1, + "random_state": 0, + "verbose": 2, +} + + +@pytest.mark.parametrize( + "test_input,expected", + [ + ( + {"predictor_kwargs": {}, "n_job": 2}, + {"n_job": 2}, + ), + ( + {"predictor_kwargs": {"n_job": 3}, "n_job": 2}, + {"n_job": 3}, + ), + ( + {"predictor_kwargs": {"iter": 4}, "n_job": 2}, + {"iter": 4, "n_job": 2}, + ), + ], +) +def test_predictor_kwargs_overwrite(test_input, expected): + assert _get_predictor_kwargs(**test_input) == expected + + +def test_model_properties_after_fitting(): + """Check the model properties after fitting a deep forest model.""" + model = CascadeForestClassifier(**toy_kwargs) + model.fit(X_train, y_train) + + assert len(model) == model.n_layers_ + + assert model[0] is model._get_layer(0) + + with pytest.raises(IndexError) as excinfo: + model._get_layer(model.n_layers_) + assert "The layer index should be in the range" in str(excinfo.value) + + with pytest.raises(RuntimeError) as excinfo: + model._set_layer(0, None) + assert "already exists in the internal container" in str(excinfo.value) + + with pytest.raises(ValueError) as excinfo: + model._get_binner(model.n_layers_ + 1) + assert "The binner index should be in the range" in str(excinfo.value) + + with pytest.raises(RuntimeError) as excinfo: + model._set_binner(0, None) + assert "already exists in the internal container" in str(excinfo.value) + + # Test the hook on forest estimator + assert ( + model.get_estimator(0, 0, "rf") + is model._get_layer(0).estimators_["0-0-rf"].estimator_ + ) + + with pytest.raises(ValueError) as excinfo: + model.get_estimator(model.n_layers_, 0, "rf") + assert "`layer_idx` should be in the range" in str(excinfo.value) + + with pytest.raises(ValueError) as excinfo: + model.get_estimator(0, model.n_estimators, "rf") + assert "`est_idx` should be in the range" in str(excinfo.value) + + with pytest.raises(ValueError) as excinfo: + model.get_estimator(0, 0, "Unknown") + assert "`estimator_type` should be one of" in str(excinfo.value) + + +@pytest.mark.parametrize("backend", ["custom", "sklearn"]) +def test_model_workflow_partial_mode(backend): + """Run the workflow of deep forest with a local buffer.""" + + case_kwargs = copy.deepcopy(kwargs) + case_kwargs.update({"partial_mode": True}) + case_kwargs.update({"backend": backend}) + + model = CascadeForestClassifier(**case_kwargs) + model.fit(X_train, y_train) + + # Test feature_importances_ + if backend == "sklearn": + model.get_layer_feature_importances(0) + else: + with pytest.raises(RuntimeError) as excinfo: + model.get_layer_feature_importances(0) + assert "Please use the sklearn backend" in str(excinfo.value) + + # Predictions before saving + y_pred_before = model.predict(X_test) + + # Save and Reload + model.save(save_dir) + + model = CascadeForestClassifier(**case_kwargs) + model.load(save_dir) + + # Predictions after loading + y_pred_after = model.predict(X_test) + + # Make sure the same predictions before and after model serialization + assert_array_equal(y_pred_before, y_pred_after) + + model.clean() # clear the buffer + shutil.rmtree(save_dir) + + +def test_model_sample_weight(): + """Run the workflow of deep forest with a local buffer.""" + + case_kwargs = copy.deepcopy(kwargs) + + # Training without sample_weight + model = CascadeForestClassifier(**case_kwargs) + model.fit(X_train, y_train) + y_pred_no_sample_weight = model.predict(X_test) + + # Training with equal sample_weight + model = CascadeForestClassifier(**case_kwargs) + sample_weight = 
np.ones(y_train.size) + model.fit(X_train, y_train, sample_weight=sample_weight) + y_pred_equal_sample_weight = model.predict(X_test) + + # Make sure the same predictions with None and equal sample_weight + assert_array_equal(y_pred_no_sample_weight, y_pred_equal_sample_weight) + + model = CascadeForestClassifier(**case_kwargs) + sample_weight = np.where(y_train == 0, 1, 10) + model.fit(X_train, y_train, sample_weight=sample_weight) + y_pred_skewed_sample_weight = model.predict(X_test) + + # Make sure the different predictions with None and equal sample_weight + assert_raises( + AssertionError, + assert_array_equal, + y_pred_skewed_sample_weight, + y_pred_equal_sample_weight, + ) + + model.clean() # clear the buffer + + +@pytest.mark.parametrize("backend", ["custom", "sklearn"]) +def test_model_workflow_in_memory(backend): + """Run the workflow of deep forest with in-memory mode.""" + + case_kwargs = copy.deepcopy(kwargs) + case_kwargs.update({"partial_mode": False}) + case_kwargs.update({"backend": backend}) + + model = CascadeForestClassifier(**case_kwargs) + model.fit(X_train, y_train) + + # Test feature_importances_ + if backend == "sklearn": + model.get_layer_feature_importances(0) + else: + with pytest.raises(RuntimeError) as excinfo: + model.get_layer_feature_importances(0) + assert "Please use the sklearn backend" in str(excinfo.value) + + # Predictions before saving + y_pred_before = model.predict(X_test) + + # Save and Reload + model.save(save_dir) + + model = CascadeForestClassifier(**case_kwargs) + model.load(save_dir) + + # Make sure the same predictions before and after model serialization + y_pred_after = model.predict(X_test) + + assert_array_equal(y_pred_before, y_pred_after) + + shutil.rmtree(save_dir) + + +@pytest.mark.parametrize( + "param", + [ + (0, {"max_layers": 0}), + (1, {"n_tolerant_rounds": 0}), + (2, {"delta": -1}), + (3, {"backend": "unknown"}), + ], +) +def test_model_invalid_training_params(param): + case_kwargs = copy.deepcopy(toy_kwargs) + case_kwargs.update(param[1]) + + model = CascadeForestClassifier(**case_kwargs) + + with pytest.raises(ValueError) as excinfo: + model.fit(X_train, y_train) + + if param[0] == 0: + assert "max_layers" in str(excinfo.value) + elif param[0] == 1: + assert "n_tolerant_rounds" in str(excinfo.value) + elif param[0] == 2: + assert "delta " in str(excinfo.value) + elif param[0] == 3: + assert "backend" in str(excinfo.value) + + +@pytest.mark.parametrize("predictor", ["forest", "xgboost", "lightgbm"]) +def test_classifier_predictor_normal(predictor): + deepforest.cascade._build_classifier_predictor( + predictor, criterion="gini", n_estimators=1, n_outputs=2 + ) + + +def test_classifier_predictor_unknown(): + with pytest.raises(NotImplementedError) as excinfo: + deepforest.cascade._build_classifier_predictor( + "unknown", criterion="gini", n_estimators=1, n_outputs=2 + ) + assert "name of the predictor should be one of" in str(excinfo.value) + + +def test_model_n_trees_non_positive(): + case_kwargs = copy.deepcopy(toy_kwargs) + case_kwargs.update({"n_trees": 0}) + model = CascadeForestClassifier(**case_kwargs) + with pytest.raises(ValueError) as excinfo: + model._set_n_trees(0) + assert "should be strictly positive." 
in str(excinfo.value) + + +def test_model_n_trees_auto(): + case_kwargs = copy.deepcopy(toy_kwargs) + case_kwargs.update({"n_trees": "auto"}) + model = CascadeForestClassifier(**case_kwargs) + + n_trees = model._set_n_trees(0) + assert n_trees == 100 + + n_trees = model._set_n_trees(2) + assert n_trees == 300 + + n_trees = model._set_n_trees(10) + assert n_trees == 500 + + +def test_model_n_trees_invalid(): + case_kwargs = copy.deepcopy(toy_kwargs) + case_kwargs.update({"n_trees": [42]}) + model = CascadeForestClassifier(**case_kwargs) + with pytest.raises(ValueError) as excinfo: + model._set_n_trees(0) + assert "Invalid value for n_trees." in str(excinfo.value) diff --git a/tests/test_model_input.py b/tests/test_model_input.py new file mode 100644 index 0000000..4d9fac0 --- /dev/null +++ b/tests/test_model_input.py @@ -0,0 +1,52 @@ +import numpy as np +from numpy.testing import assert_array_equal + +from sklearn.datasets import load_digits +from deepforest import CascadeForestClassifier + + +toy_kwargs = { + "n_bins": 10, + "bin_subsample": 2e5, + "max_layers": 10, + "n_estimators": 1, + "criterion": "gini", + "n_trees": 100, + "max_depth": 3, + "min_samples_leaf": 1, + "use_predictor": True, + "predictor": "forest", + "predictor_kwargs": {}, + "n_tolerant_rounds": 2, + "delta": 1e-5, + "n_jobs": -1, + "random_state": 0, + "verbose": 2, +} + + +def test_model_input_label_encoder(): + """Test if the model behaves the same with and without label encoding.""" + + # Load data + X, y = load_digits(return_X_y=True) + y_as_str = np.char.add("label_", y.astype(str)) + + # Train model on integer labels. Labels should look like: 1, 2, 3, ... + model = CascadeForestClassifier(**toy_kwargs) + model.fit(X, y) + y_pred_int_labels = model.predict(X) + + # Train model on string labels. Labels should look like: "label_1", "label_2", "label_3", ... 
+ model = CascadeForestClassifier(**toy_kwargs) + model.fit(X, y_as_str) + y_pred_str_labels = model.predict(X) + + # Check if the underlying data are the same + y_pred_int_labels_as_str = np.char.add( + "label_", y_pred_int_labels.astype(str) + ) + assert_array_equal(y_pred_str_labels, y_pred_int_labels_as_str) + + # Clean up buffer + model.clean() diff --git a/tests/test_model.py b/tests/test_model_regressor.py similarity index 59% rename from tests/test_model.py rename to tests/test_model_regressor.py index 8a625bd..1149019 100644 --- a/tests/test_model.py +++ b/tests/test_model_regressor.py @@ -1,54 +1,64 @@ import copy import pytest import shutil +import numpy as np from numpy.testing import assert_array_equal -from sklearn.datasets import load_iris +from sklearn.datasets import load_boston from sklearn.model_selection import train_test_split import deepforest -from deepforest import CascadeForestClassifier +from deepforest import CascadeForestRegressor from deepforest.cascade import _get_predictor_kwargs save_dir = "./tmp" # Load data -X, y = load_iris(return_X_y=True) +X, y = load_boston(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.42, random_state=42) + X, y, test_size=0.42, random_state=42 +) # Parameters -toy_kwargs = {"n_bins": 10, - "bin_subsample": 2e5, - "max_layers": 10, - "n_estimators": 1, - "n_trees": 100, - "max_depth": 3, - "min_samples_leaf": 1, - "use_predictor": True, - "predictor": "forest", - "predictor_kwargs": {}, - "n_tolerant_rounds": 2, - "delta": 1e-5, - "n_jobs": -1, - "random_state": 0, - "verbose": 2} - -kwargs = {"n_bins": 255, - "bin_subsample": 2e5, - "max_layers": 10, - "n_estimators": 2, - "n_trees": 100, - "max_depth": None, - "min_samples_leaf": 1, - "use_predictor": True, - "predictor": "forest", - "predictor_kwargs": {}, - "n_tolerant_rounds": 2, - "delta": 1e-5, - "n_jobs": -1, - "random_state": 0, - "verbose": 2} +toy_kwargs = { + "n_bins": 10, + "bin_subsample": 2e5, + "max_layers": 10, + "criterion": "mse", + "n_estimators": 1, + "n_trees": 100, + "max_depth": 3, + "min_samples_split": 2, + "min_samples_leaf": 1, + "use_predictor": True, + "predictor": "forest", + "predictor_kwargs": {}, + "n_tolerant_rounds": 2, + "delta": 1e-5, + "n_jobs": -1, + "random_state": 0, + "verbose": 2, +} + +kwargs = { + "n_bins": 255, + "bin_subsample": 2e5, + "max_layers": 10, + "criterion": "mse", + "n_estimators": 2, + "n_trees": 100, + "max_depth": None, + "min_samples_split": 2, + "min_samples_leaf": 1, + "use_predictor": True, + "predictor": "forest", + "predictor_kwargs": {}, + "n_tolerant_rounds": 2, + "delta": 1e-5, + "n_jobs": -1, + "random_state": 0, + "verbose": 2, +} @pytest.mark.parametrize( @@ -74,14 +84,14 @@ def test_predictor_kwargs_overwrite(test_input, expected): def test_model_properties_after_fitting(): """Check the model properties after fitting a deep forest model.""" - model = CascadeForestClassifier(**toy_kwargs) + model = CascadeForestRegressor(**toy_kwargs) model.fit(X_train, y_train) assert len(model) == model.n_layers_ assert model[0] is model._get_layer(0) - with pytest.raises(ValueError) as excinfo: + with pytest.raises(IndexError) as excinfo: model._get_layer(model.n_layers_) assert "The layer index should be in the range" in str(excinfo.value) @@ -98,26 +108,28 @@ def test_model_properties_after_fitting(): assert "already exists in the internal container" in str(excinfo.value) -def test_model_workflow_partial_mode(): +@pytest.mark.parametrize("backend", ["custom", "sklearn"]) +def 
test_model_workflow_partial_mode(backend): """Run the workflow of deep forest with a local buffer.""" case_kwargs = copy.deepcopy(kwargs) case_kwargs.update({"partial_mode": True}) + case_kwargs.update({"backend": backend}) - model = CascadeForestClassifier(**case_kwargs) + model = CascadeForestRegressor(**case_kwargs) model.fit(X_train, y_train) # Predictions before saving - y_pred_before = model.predict(X_test) + y_pred_before = model.predict(X_test).astype(np.float32) # Save and Reload model.save(save_dir) - model = CascadeForestClassifier(**case_kwargs) + model = CascadeForestRegressor(**case_kwargs) model.load(save_dir) # Predictions after loading - y_pred_after = model.predict(X_test) + y_pred_after = model.predict(X_test).astype(np.float32) # Make sure the same predictions before and after model serialization assert_array_equal(y_pred_before, y_pred_after) @@ -126,41 +138,48 @@ def test_model_workflow_partial_mode(): shutil.rmtree(save_dir) -def test_model_workflow_in_memory(): +@pytest.mark.parametrize("backend", ["custom", "sklearn"]) +def test_model_workflow_in_memory(backend): """Run the workflow of deep forest with in-memory mode.""" case_kwargs = copy.deepcopy(kwargs) case_kwargs.update({"partial_mode": False}) + case_kwargs.update({"backend": backend}) - model = CascadeForestClassifier(**case_kwargs) + model = CascadeForestRegressor(**case_kwargs) model.fit(X_train, y_train) # Predictions before saving - y_pred_before = model.predict(X_test) + y_pred_before = model.predict(X_test).astype(np.float32) # Save and Reload model.save(save_dir) - model = CascadeForestClassifier(**case_kwargs) + model = CascadeForestRegressor(**case_kwargs) model.load(save_dir) # Make sure the same predictions before and after model serialization - y_pred_after = model.predict(X_test) + y_pred_after = model.predict(X_test).astype(np.float32) assert_array_equal(y_pred_before, y_pred_after) shutil.rmtree(save_dir) -@pytest.mark.parametrize('param', - [(0, {"max_layers": 0}), - (1, {"n_tolerant_rounds": 0}), - (2, {"delta": -1})]) +@pytest.mark.parametrize( + "param", + [ + (0, {"max_layers": 0}), + (1, {"n_tolerant_rounds": 0}), + (2, {"delta": -1}), + (3, {"backend": "unknown"}), + ], +) def test_model_invalid_training_params(param): case_kwargs = copy.deepcopy(toy_kwargs) case_kwargs.update(param[1]) - model = CascadeForestClassifier(**case_kwargs) + model = CascadeForestRegressor(**case_kwargs) with pytest.raises(ValueError) as excinfo: model.fit(X_train, y_train) @@ -171,27 +190,29 @@ def test_model_invalid_training_params(param): assert "n_tolerant_rounds" in str(excinfo.value) elif param[0] == 2: assert "delta " in str(excinfo.value) + elif param[0] == 3: + assert "backend" in str(excinfo.value) -@pytest.mark.parametrize('predictor', ['forest', 'xgboost', 'lightgbm']) -def test_predictor_normal(predictor): - deepforest.cascade._build_predictor(predictor, - n_estimators=1, - n_outputs=2) +@pytest.mark.parametrize("predictor", ["forest", "xgboost", "lightgbm"]) +def test_regressor_predictor_normal(predictor): + deepforest.cascade._build_regressor_predictor( + predictor, criterion="mse", n_estimators=1, n_outputs=2 + ) -def test_predictor_unknown(): +def test_regressor_predictor_unknown(): with pytest.raises(NotImplementedError) as excinfo: - deepforest.cascade._build_predictor("unknown", - n_estimators=1, - n_outputs=2) + deepforest.cascade._build_regressor_predictor( + "unknown", criterion="mse", n_estimators=1, n_outputs=2 + ) assert "name of the predictor should be one of" in str(excinfo.value) 
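
The regressor cases above follow the same round trip as the classifier suite: fit on the Boston housing data, predict, save the cascade to a directory, reload it into a fresh estimator, and require identical predictions. A minimal sketch of that workflow, using only the public CascadeForestRegressor calls exercised in these tests (the parameter values and the ./tmp path are illustrative, not part of the patch):

import shutil

import numpy as np
from numpy.testing import assert_array_equal
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

from deepforest import CascadeForestRegressor

X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.42, random_state=42
)

# Fit a small cascade and keep its predictions for later comparison
model = CascadeForestRegressor(n_estimators=2, n_trees=100, random_state=0)
model.fit(X_train, y_train)
y_pred_before = model.predict(X_test).astype(np.float32)

# Serialize, reload into a fresh estimator, and check the predictions match
model.save("./tmp")
model = CascadeForestRegressor(n_estimators=2, n_trees=100, random_state=0)
model.load("./tmp")
y_pred_after = model.predict(X_test).astype(np.float32)

assert_array_equal(y_pred_before, y_pred_after)
shutil.rmtree("./tmp")  # remove the serialized model directory

The tests cast predictions to np.float32 on both sides so the comparison is not affected by dtype changes across save and load.
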
def test_model_n_trees_non_positive(): case_kwargs = copy.deepcopy(toy_kwargs) case_kwargs.update({"n_trees": 0}) - model = CascadeForestClassifier(**case_kwargs) + model = CascadeForestRegressor(**case_kwargs) with pytest.raises(ValueError) as excinfo: model._set_n_trees(0) assert "should be strictly positive." in str(excinfo.value) @@ -200,7 +221,7 @@ def test_model_n_trees_non_positive(): def test_model_n_trees_auto(): case_kwargs = copy.deepcopy(toy_kwargs) case_kwargs.update({"n_trees": "auto"}) - model = CascadeForestClassifier(**case_kwargs) + model = CascadeForestRegressor(**case_kwargs) n_trees = model._set_n_trees(0) assert n_trees == 100 @@ -215,7 +236,7 @@ def test_model_n_trees_auto(): def test_model_n_trees_invalid(): case_kwargs = copy.deepcopy(toy_kwargs) case_kwargs.update({"n_trees": [42]}) - model = CascadeForestClassifier(**case_kwargs) + model = CascadeForestRegressor(**case_kwargs) with pytest.raises(ValueError) as excinfo: model._set_n_trees(0) assert "Invalid value for n_trees." in str(excinfo.value) diff --git a/tests/test_set_custom_estimator.py b/tests/test_set_custom_estimator.py new file mode 100644 index 0000000..6ec5d55 --- /dev/null +++ b/tests/test_set_custom_estimator.py @@ -0,0 +1,227 @@ +import pytest +import shutil +import numpy as np +from numpy.testing import assert_array_equal +from sklearn.datasets import load_iris, load_boston +from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor +from sklearn.model_selection import train_test_split + +from deepforest import CascadeForestClassifier, CascadeForestRegressor + + +save_dir = "./tmp" + +# Load data +X, y = load_iris(return_X_y=True) +X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split( + X, y, test_size=0.42, random_state=42 +) + +X, y = load_boston(return_X_y=True) +X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split( + X, y, test_size=0.42, random_state=42 +) + +# multi-output target values +y_train_reg_multi = np.array([y_train_reg, y_train_reg]).reshape(-1, 2) + + +def test_classifier_custom_cascade_layer_workflow_in_memory(): + + model = CascadeForestClassifier() + + n_estimators = 4 + estimators = [DecisionTreeClassifier() for _ in range(n_estimators)] + model.set_estimator(estimators) # set custom base estimators + + predictor = DecisionTreeClassifier() + model.set_predictor(predictor) + + model.fit(X_train_clf, y_train_clf) + y_pred_before = model.predict(X_test_clf) + + # Save and Reload + model.save(save_dir) + + model = CascadeForestClassifier() + model.load(save_dir) + + # Predictions after loading + y_pred_after = model.predict(X_test_clf) + + # Make sure the same predictions before and after model serialization + assert_array_equal(y_pred_before, y_pred_after) + + assert ( + model.get_estimator(0, 0, "custom") + is model._get_layer(0).estimators_["0-0-custom"].estimator_ + ) + + model.clean() # clear the buffer + shutil.rmtree(save_dir) + + +def test_classifier_custom_cascade_layer_workflow_partial_mode(): + + model = CascadeForestClassifier(partial_mode=True) + + n_estimators = 4 + estimators = [DecisionTreeClassifier() for _ in range(n_estimators)] + model.set_estimator(estimators) # set custom base estimators + + predictor = DecisionTreeClassifier() + model.set_predictor(predictor) + + model.fit(X_train_clf, y_train_clf) + y_pred_before = model.predict(X_test_clf) + + # Save and Reload + model.save(save_dir) + + model = CascadeForestClassifier() + model.load(save_dir) + + # Predictions after loading + y_pred_after = 
model.predict(X_test_clf) + + # Make sure the same predictions before and after model serialization + assert_array_equal(y_pred_before, y_pred_after) + + model.clean() # clear the buffer + shutil.rmtree(save_dir) + + +@pytest.mark.parametrize("y_train", [y_train_reg, y_train_reg_multi]) +def test_regressor_custom_cascade_layer_workflow_in_memory(y_train): + + model = CascadeForestRegressor() + + n_estimators = 4 + estimators = [DecisionTreeRegressor() for _ in range(n_estimators)] + model.set_estimator(estimators) # set custom base estimators + + predictor = DecisionTreeRegressor() + model.set_predictor(predictor) + + model.fit(X_train_reg, y_train) + y_pred_before = model.predict(X_test_reg) + + # Save and Reload + model.save(save_dir) + + model = CascadeForestRegressor() + model.load(save_dir) + + # Predictions after loading + y_pred_after = model.predict(X_test_reg) + + # Make sure the same predictions before and after model serialization + assert_array_equal(y_pred_before, y_pred_after) + + assert ( + model.get_estimator(0, 0, "custom") + is model._get_layer(0).estimators_["0-0-custom"].estimator_ + ) + + model.clean() # clear the buffer + shutil.rmtree(save_dir) + + +@pytest.mark.parametrize("y_train", [y_train_reg, y_train_reg_multi]) +def test_regressor_custom_cascade_layer_workflow_partial_mode(y_train): + + model = CascadeForestRegressor(partial_mode=True) + + n_estimators = 4 + estimators = [DecisionTreeRegressor() for _ in range(n_estimators)] + model.set_estimator(estimators) # set custom base estimators + + predictor = DecisionTreeRegressor() + model.set_predictor(predictor) + + model.fit(X_train_reg, y_train) + y_pred_before = model.predict(X_test_reg) + + # Save and Reload + model.save(save_dir) + + model = CascadeForestRegressor() + model.load(save_dir) + + # Predictions after loading + y_pred_after = model.predict(X_test_reg) + + # Make sure the same predictions before and after model serialization + assert_array_equal(y_pred_before, y_pred_after) + + model.clean() # clear the buffer + shutil.rmtree(save_dir) + + +def test_custom_base_estimator_wrong_estimator_type(): + + model = CascadeForestClassifier() + with pytest.raises(ValueError) as excinfo: + model.set_estimator(42) + assert "estimators should be a list" in str(excinfo.value) + + +def test_custom_estimator_missing_fit(): + class tmp_estimator: + def __init__(self): + pass + + model = CascadeForestClassifier() + with pytest.raises(AttributeError) as excinfo: + model.set_estimator([tmp_estimator()]) + assert "The `fit` method of estimator" in str(excinfo.value) + + with pytest.raises(AttributeError) as excinfo: + model.set_predictor(tmp_estimator()) + assert "The `fit` method of the predictor" in str(excinfo.value) + + +def test_custom_base_estimator_missing_predict_proba(): + class tmp_estimator: + def __init__(self): + pass + + def fit(self, X, y): + pass + + model = CascadeForestClassifier() + with pytest.raises(AttributeError) as excinfo: + model.set_estimator([tmp_estimator()]) + assert "The `predict_proba` method" in str(excinfo.value) + + with pytest.raises(AttributeError) as excinfo: + model.set_predictor(tmp_estimator()) + assert "The `predict_proba` method of the predictor" in str(excinfo.value) + + +def test_custom_base_estimator_missing_predict(): + class tmp_estimator: + def __init__(self): + pass + + def fit(self, X, y): + pass + + model = CascadeForestRegressor() + with pytest.raises(AttributeError) as excinfo: + model.set_estimator([tmp_estimator()]) + assert "The `predict` method" in 
str(excinfo.value) + + with pytest.raises(AttributeError) as excinfo: + model.set_predictor(tmp_estimator()) + assert "The `predict` method of the predictor" in str(excinfo.value) + + +def test_custom_base_estimator_invalid_n_splits(): + + model = CascadeForestRegressor() + n_estimators = 4 + estimators = [DecisionTreeClassifier() for _ in range(n_estimators)] + with pytest.raises(ValueError) as excinfo: + model.set_estimator(estimators, n_splits=1) + assert "should be at least 2" in str(excinfo.value) diff --git a/tests/test_tree regressor.py b/tests/test_tree regressor.py new file mode 100644 index 0000000..2b43180 --- /dev/null +++ b/tests/test_tree regressor.py @@ -0,0 +1,66 @@ +import pytest +from sklearn.datasets import load_boston +from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper + +from deepforest import DecisionTreeRegressor + + +X, y = load_boston(return_X_y=True) + +# Data binning +binner = _BinMapper(random_state=42) +X_binned = binner.fit_transform(X) + + +def test_tree_properties_after_fitting(): + tree = DecisionTreeRegressor() + tree.fit(X_binned, y) + + assert tree.get_depth() == tree.tree_.max_depth + assert tree.n_leaves == tree.tree_.n_leaves + assert tree.n_internals == tree.tree_.n_internals + + +def test_tree_fit_invalid_dtype(): + tree = DecisionTreeRegressor() + + with pytest.raises(RuntimeError) as execinfo: + tree.fit(X, y) + assert "The dtype of `X` should be `np.uint8`" in str(execinfo.value) + + +def test_tree_fit_invalid_training_params(): + tree = DecisionTreeRegressor(min_samples_leaf=0) + with pytest.raises(ValueError) as execinfo: + tree.fit(X_binned, y) + assert "min_samples_leaf must be at least 1" in str(execinfo.value) + + tree = DecisionTreeRegressor(min_samples_leaf=0.6) + with pytest.raises(ValueError) as execinfo: + tree.fit(X_binned, y) + assert "or in (0, 0.5]" in str(execinfo.value) + + tree = DecisionTreeRegressor(min_samples_split=1) + with pytest.raises(ValueError) as execinfo: + tree.fit(X_binned, y) + assert "min_samples_split must be an integer" in str(execinfo.value) + + tree = DecisionTreeRegressor(max_features="unknown") + with pytest.raises(ValueError) as execinfo: + tree.fit(X_binned, y) + assert "Invalid value for max_features." in str(execinfo.value) + + tree = DecisionTreeRegressor() + with pytest.raises(ValueError) as execinfo: + tree.fit(X_binned, y[:1]) + assert "Number of labels=" in str(execinfo.value) + + tree = DecisionTreeRegressor(min_weight_fraction_leaf=0.6) + with pytest.raises(ValueError) as execinfo: + tree.fit(X_binned, y) + assert "min_weight_fraction_leaf must in [0, 0.5]" in str(execinfo.value) + + tree = DecisionTreeRegressor(max_depth=0) + with pytest.raises(ValueError) as execinfo: + tree.fit(X_binned, y) + assert "max_depth must be greater than zero." in str(execinfo.value) diff --git a/tests/test_tree.py b/tests/test_tree_classifier.py similarity index 100% rename from tests/test_tree.py rename to tests/test_tree_classifier.py diff --git a/tests/test_tree_same.py b/tests/test_tree_same.py index d380b05..d0e0335 100644 --- a/tests/test_tree_same.py +++ b/tests/test_tree_same.py @@ -1,26 +1,32 @@ """ -Testing cases here make sure that the outputs of the reduced implementation -on `DecisionTreeClassifier` and `ExtraTreeClassifier` are exactly the same as -the original version in Scikit-Learn after the data binning. +Testing cases here make sure that predictions of the reduced implementation +on decision tree is exactly the same as the original version in Scikit-Learn +after data binning. 
""" import pytest +import numpy as np from numpy.testing import assert_array_equal from sklearn.tree import ( DecisionTreeClassifier as sklearn_DecisionTreeClassifier, ) +from sklearn.tree import ( + DecisionTreeRegressor as sklearn_DecisionTreeRegressor, +) from sklearn.tree import ExtraTreeClassifier as sklearn_ExtraTreeClassifier +from sklearn.tree import ExtraTreeRegressor as sklearn_ExtraTreeRegressor # Load utils from sklearn.model_selection import train_test_split from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper -# Toy classification datasets -from sklearn.datasets import load_iris, load_wine +# Toy datasets +from sklearn.datasets import load_iris, load_wine, load_boston from deepforest import DecisionTreeClassifier from deepforest import ExtraTreeClassifier - +from deepforest import DecisionTreeRegressor +from deepforest import ExtraTreeRegressor test_size = 0.42 random_state = 42 @@ -81,3 +87,115 @@ def test_extra_tree_classifier_proba(load_func): assert_array_equal(actual_pred, expected_pred) assert_array_equal(actual_proba, expected_proba) + + +@pytest.mark.parametrize("load_func", [load_boston]) +def test_tree_regressor_pred(load_func): + + X, y = load_func(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=random_state + ) + + # Data binning + binner = _BinMapper(random_state=random_state) + X_train_binned = binner.fit_transform(X_train) + X_test_binned = binner.transform(X_test) + + # Ours + model = DecisionTreeRegressor(random_state=random_state) + model.fit(X_train_binned, y_train) + actual_pred = model.predict(X_test_binned) + + # Sklearn + model = sklearn_DecisionTreeRegressor(random_state=random_state) + model.fit(X_train_binned, y_train) + expected_pred = model.predict(X_test_binned) + + assert_array_equal(actual_pred, expected_pred) + + +@pytest.mark.parametrize("load_func", [load_boston]) +def test_extra_tree_regressor_pred(load_func): + X, y = load_func(return_X_y=True) + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=random_state + ) + + # Data binning + binner = _BinMapper(random_state=random_state) + X_train_binned = binner.fit_transform(X_train) + X_test_binned = binner.transform(X_test) + + # Ours + model = ExtraTreeRegressor(random_state=random_state) + model.fit(X_train_binned, y_train) + actual_pred = model.predict(X_test_binned) + + # Sklearn + model = sklearn_ExtraTreeRegressor(random_state=random_state) + model.fit(X_train_binned, y_train) + expected_pred = model.predict(X_test_binned) + + assert_array_equal(actual_pred, expected_pred) + + +@pytest.mark.parametrize("load_func", [load_boston]) +def test_tree_regressor_multi_output_pred(load_func): + + X, y = load_func(return_X_y=True) + + # Generate pseudo multi output targets + y = np.expand_dims(y, axis=1) + y = np.concatenate((y, -y), axis=1) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=random_state + ) + + # Data binning + binner = _BinMapper(random_state=random_state) + X_train_binned = binner.fit_transform(X_train) + X_test_binned = binner.transform(X_test) + + # Ours + model = DecisionTreeRegressor(random_state=random_state) + model.fit(X_train_binned, y_train) + actual_pred = model.predict(X_test_binned) + + # Sklearn + model = sklearn_DecisionTreeRegressor(random_state=random_state) + model.fit(X_train_binned, y_train) + expected_pred = model.predict(X_test_binned) + + assert_array_equal(actual_pred, 
expected_pred) + + +@pytest.mark.parametrize("load_func", [load_boston]) +def test_extra_tree_regressor_multi_output_pred(load_func): + X, y = load_func(return_X_y=True) + + # Generate pseudo multi output targets + y = np.expand_dims(y, axis=1) + y = np.concatenate((y, -y), axis=1) + + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=test_size, random_state=random_state + ) + + # Data binning + binner = _BinMapper(random_state=random_state) + X_train_binned = binner.fit_transform(X_train) + X_test_binned = binner.transform(X_test) + + # Ours + model = ExtraTreeRegressor(random_state=random_state) + model.fit(X_train_binned, y_train) + actual_pred = model.predict(X_test_binned) + + # Sklearn + model = sklearn_ExtraTreeRegressor(random_state=random_state) + model.fit(X_train_binned, y_train) + expected_pred = model.predict(X_test_binned) + + assert_array_equal(actual_pred, expected_pred)
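
Every regressor parity case above follows the same pattern: bin the raw features with sklearn's _BinMapper, fit the reduced deepforest tree and the corresponding scikit-learn tree on the identical binned data, and require exact agreement of the predictions. A condensed single-output sketch of that pattern, with illustrative parameter values:

from numpy.testing import assert_array_equal
from sklearn.datasets import load_boston
from sklearn.ensemble._hist_gradient_boosting.binning import _BinMapper
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor as sklearn_DecisionTreeRegressor

from deepforest import DecisionTreeRegressor

X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.42, random_state=42
)

# Bin the features first: the reduced trees expect binned (uint8) inputs
binner = _BinMapper(random_state=42)
X_train_binned = binner.fit_transform(X_train)
X_test_binned = binner.transform(X_test)

# Train both implementations on the same binned data
ours = DecisionTreeRegressor(random_state=42)
ours.fit(X_train_binned, y_train)
theirs = sklearn_DecisionTreeRegressor(random_state=42)
theirs.fit(X_train_binned, y_train)

# After binning, the two trees are expected to agree exactly
assert_array_equal(ours.predict(X_test_binned), theirs.predict(X_test_binned))

The multi-output cases differ only in stacking y and -y into a two-column target before the split.
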