diff --git a/.github/images/logo_black.png b/.github/images/logo_black.png new file mode 100644 index 0000000..59cdef0 Binary files /dev/null and b/.github/images/logo_black.png differ diff --git a/.github/images/logo_white.png b/.github/images/logo_white.png new file mode 100644 index 0000000..073885a Binary files /dev/null and b/.github/images/logo_white.png differ diff --git a/README.md b/README.md index 29c3dc1..b23e046 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ +

+ logo +

+ # arm-preprocessing ![PyPI Version](https://img.shields.io/pypi/v/arm-preprocessing.svg) [![arm-preprocessing](https://github.com/firefly-cpp/arm-preprocessing/actions/workflows/test.yml/badge.svg)](https://github.com/firefly-cpp/arm-preprocessing/actions/workflows/test.yml) @@ -18,7 +22,7 @@ arm-preprocessing is a lightweight Python library supporting several key steps involving data preparation, manipulation, and discretisation for Association Rule Mining (ARM). 🧠 Embrace its minimalistic design that prioritises simplicity. 💡 The framework is intended to be fully extensible and offers seamless integration with related ARM libraries (e.g., [NiaARM](https://github.com/firefly-cpp/NiaARM)). 🔗 ## Why arm-preprocessing? -While numerous libraries facilitate data mining preprocessing tasks, this library is designed to integrate seamlessly with association rule mining. It harmonizes well with the NiaARM library, a robust numerical association rule mining framework. The primary aim is to bridge the gap between preprocessing and rule mining, simplifying the workflow/pipeline. Additionally, its design allows for the effortless incorporation of new preprocessing methods and fast benchmarking. +While numerous libraries facilitate data mining preprocessing tasks, this library is designed to integrate seamlessly with association rule mining. It harmonises well with the NiaARM library, a robust numerical association rule mining framework. The primary aim is to bridge the gap between preprocessing and rule mining, simplifying the workflow/pipeline. Additionally, its design allows for the effortless incorporation of new preprocessing methods and fast benchmarking. 
## Key features ✨ - Loading various formats of datasets (CSV, JSON, TXT) 📊 @@ -28,6 +32,8 @@ While numerous libraries facilitate data mining preprocessing tasks, this librar - Dataset statistics 📈 - Discretisation methods 📏 - Data squashing methods 🤏 +- Feature scaling methods ⚖️ +- Feature selection methods 🎯 ## Installation 📦 ### pip @@ -35,6 +41,15 @@ To install ``arm-preprocessing`` with pip, use: ```bash pip install arm-preprocessing ``` +To install ``arm-preprocessing`` on Alpine Linux, please use: +```sh +$ apk add py3-arm-preprocessing +``` + +To install ``arm-preprocessing`` on Arch Linux, please use an [AUR helper](https://wiki.archlinux.org/title/AUR_helpers): +```sh +$ yay -Syyu python-arm-preprocessing +``` ## Usage 🚀 ### Data loading @@ -54,6 +69,23 @@ dataset.load_data() df = dataset.data ``` +### Missing values +The following example demonstrates how to handle missing values in a dataset using imputation. More examples can be found in the [examples/missing_values](./examples/missing_values) directory: +- [Handling missing values in a dataset using row deletion](./examples/missing_values/missing_values_rows.py) +- [Handling missing values in a dataset using column deletion](./examples/missing_values/missing_values_columns.py) +- [Handling missing values in a dataset using imputation](./examples/missing_values/missing_values_impute.py) + +```python +from arm_preprocessing.dataset import Dataset + +# Initialise dataset with filename and format +dataset = Dataset('examples/missing_values/data', format='csv') +dataset.load() + +# Impute missing data +dataset.missing_values(method='impute') +``` + ### Data discretisation The following example demonstrates how to discretise a dataset using the equal width method. 
More examples can be found in the [examples/discretisation](./examples/discretisation) directory: - [Discretising a dataset using the equal width method](./examples/discretisation/equal_width_discretisation.py) @@ -87,26 +119,43 @@ dataset.load() dataset.squash(threshold=0.75, similarity='euclidean') ``` -### Missing values -The following example demonstrates how to handle missing values in a dataset using imputation. More examples can be found in the [examples/missing_values](./examples/missing_values) directory: -- [Handling missing values in a dataset using row deletion](./examples/missing_values/missing_values_rows.py) -- [Handling missing values in a dataset using column deletion](./examples/missing_values/missing_values_columns.py) -- [Handling missing values in a dataset using imputation](./examples/missing_values/missing_values_impute.py) +### Feature scaling +The following example demonstrates how to scale the dataset's features. More examples can be found in the [examples/scaling](./examples/scaling) directory: +- [Scale features using normalisation](./examples/scaling/normalisation.py) +- [Scale features using standardisation](./examples/scaling/standardisation.py) ```python from arm_preprocessing.dataset import Dataset # Initialise dataset with filename and format -dataset = Dataset('examples/missing_values/data', format='csv') +dataset = Dataset('datasets/Abalone', format='csv') dataset.load() -# Impute missing data -dataset.missing_values(method='impute') +# Scale dataset using normalisation +dataset.scale(method='normalisation') +``` + +### Feature selection +The following example demonstrates how to select features from a dataset. 
More examples can be found in the [examples/feature_selection](./examples/feature_selection) directory: +- [Select features using the Kendall Tau correlation coefficient](./examples/feature_selection/feature_selection.py) + +```python +from arm_preprocessing.dataset import Dataset + +# Initialise dataset with filename and format +dataset = Dataset('datasets/sportydatagen', format='csv') +dataset.load() + +# Feature selection +dataset.feature_selection( + method='kendall', threshold=0.15, class_column='calories') ``` ## Related frameworks 🔗 -[1] [NiaARM: A minimalistic framework for Numerical Association Rule Mining](https://github.com/firefly-cpp/NiaARM) +[1] [NiaARM: A minimalistic framework for Numerical Association Rule Mining](https://github.com/firefly-cpp/NiaARM) + +[2] [uARMSolver: universal Association Rule Mining Solver](https://github.com/firefly-cpp/uARMSolver) ## References 📚 diff --git a/arm_preprocessing/__init__.py b/arm_preprocessing/__init__.py index df9144c..7fd229a 100644 --- a/arm_preprocessing/__init__.py +++ b/arm_preprocessing/__init__.py @@ -1 +1 @@ -__version__ = '0.1.1' +__version__ = '0.2.0' diff --git a/arm_preprocessing/dataset.py b/arm_preprocessing/dataset.py index 8092616..0e28cba 100644 --- a/arm_preprocessing/dataset.py +++ b/arm_preprocessing/dataset.py @@ -22,7 +22,7 @@ class Dataset: data (pd.DataFrame): Dataset. """ - def __init__(self, filename, format="csv", target_format=None, datetime_columns=[]): + def __init__(self, filename=None, format="csv", target_format=None, datetime_columns=[]): """ Initialise a Dataset instance. 
@@ -203,7 +203,7 @@ def missing_values(self, method): if self.data[column].dtype == 'object': self.data[column].fillna( self.data[column].mode()[0], inplace=True) - elif self.data[column].dtype == 'datetime64[ns]': + elif self.data[column].dtype == 'datetime64[ns]' or self.data[column].dtype == 'category': self.data[column].fillna( self.data[column].mode()[0], inplace=True) else: @@ -258,6 +258,66 @@ def squash(self, threshold, similarity="euclidean"): # Squash data self.data = Squash.squash(self.data, threshold, similarity) + def scale(self, method): + """ + Scale the dataset using the specified method. + + Args: + method (str): Scaling method ('normalisation', 'standardisation'). + + Raises: + ValueError: Invalid scaling method. + """ + # Validate method + if method not in ['normalisation', 'standardisation']: + raise ValueError(f'Invalid scaling method: {method}') + + # Scale data + for column in self.data.columns: + # Skip non-numerical columns + if self.data[column].dtype in ['datetime64[ns]', 'object']: + continue + + if method == 'normalisation': + self.data[column] = ( + self.data[column] - self.data[column].min() + ) / (self.data[column].max() - self.data[column].min()) + elif method == 'standardisation': + self.data[column] = ( + self.data[column] - self.data[column].mean() + ) / self.data[column].std() + + def feature_selection(self, method, threshold, class_column): + """ + Select features based on the specified threshold. + + Args: + method (str): Feature selection method ('pearson', 'spearman', 'kendall'). + threshold (float): Threshold. + class_column (str): Name of the column containing class labels. + + Raises: + ValueError: Invalid feature selection method. + ValueError: Column is not numerical. 
+ + Returns: + None + """ + # Validate method + if method not in ['pearson', 'spearman', 'kendall']: + raise ValueError(f'Invalid feature selection method: {method}') + + # Raise ValueError if column in self.data is not numerical + for column in self.data.columns: + if self.data[column].dtype not in ['int64', 'float64']: + raise ValueError(f'Column {column} is not numerical') + + # Calculate feature importance + feature_importance = self.data.corr(method=method)[class_column] + + # Select features + self.data = self.data[feature_importance[feature_importance >= threshold].index] + def filter_between_dates( self, start_date=None, end_date=None, datetime_column=None ): diff --git a/datasets/README.md b/datasets/README.md index cf24572..be201f5 100644 --- a/datasets/README.md +++ b/datasets/README.md @@ -7,3 +7,5 @@ [3] The Breast Cancer dataset is downloaded from https://archive.ics.uci.edu/ml/index.php [4] The Nursery dataset is downloaded from https://archive.ics.uci.edu/ml/index.php + +[5] https://github.com/firefly-cpp/NiaARM diff --git a/docs/conf.py b/docs/conf.py index 56d0214..4e365d7 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -22,7 +22,7 @@ author = 'Tadej Lahovnik, Iztok Fister Jr.' 
# The full version, including alpha/beta/rc tags -release = '0.1.1' +release = '0.2.0' # -- General configuration --------------------------------------------------- @@ -53,9 +53,8 @@ # html_static_path = ['_static'] # Add logo for project -""" html_logo = '../.github/images/logo.png' +html_logo = '../.github/images/logo_white.png' html_theme_options = { 'logo_only': True, 'display_version': False, } - """ diff --git a/docs/user/usage.rst b/docs/user/usage.rst index 1d93715..982ec98 100644 --- a/docs/user/usage.rst +++ b/docs/user/usage.rst @@ -1,6 +1,3 @@ -User documentation ------------------- - Installation ============ @@ -10,15 +7,29 @@ To install ``arm-preprocessing`` with pip, use: pip install arm-preprocessing +To install ``arm-preprocessing`` on Alpine Linux, use: + +.. code:: bash + + $ apk add py3-arm-preprocessing + +To install ``arm-preprocessing`` on Arch Linux, use an `AUR helper `_: + +.. code:: bash + + $ yay -Syyu python-arm-preprocessing + Usage ===== This section demonstrates the usage of the arm-preprocessing framework. * :ref:`data loading` +* :ref:`missing values` * :ref:`data discretisation` * :ref:`data squashing` -* :ref:`missing values` +* :ref:`feature scaling` +* :ref:`feature selection` .. _data loading: @@ -67,6 +78,54 @@ The following examples demonstrate how to load a dataset from a file (csv, json, # Print dataset information (columns, categories, min/max values, etc.) dataset.dataset_statistics() + .. _missing values: + +.. _missing values: + +Missing values +~~~~~~~~~~~~~~ + +The following examples demonstrate how to handle missing values in a dataset. + +.. code:: python + + from arm_preprocessing.dataset import Dataset + + # Initialise dataset with filename and format + dataset = Dataset('examples/missing_values/data', format='csv') + + # Load dataset + dataset.load() + + # Remove columns with missing data + dataset.missing_values(method='column') + +.. 
code:: python + + from arm_preprocessing.dataset import Dataset + + # Initialise dataset with filename and format + dataset = Dataset('examples/missing_values/data', format='csv') + + # Load dataset + dataset.load() + + # Remove rows with missing data + dataset.missing_values(method='row') + +.. code:: python + + from arm_preprocessing.dataset import Dataset + + # Initialise dataset with filename and format + dataset = Dataset('examples/missing_values/data', format='csv') + + # Load dataset + dataset.load() + + # Impute missing data + dataset.missing_values(method='impute') + .. _data discretisation: Data discretisation @@ -153,48 +212,50 @@ The following examples demonstrate how to squash a dataset. # Squash dataset dataset.squash(threshold=0.99, similarity='cosine') -.. _missing values: +.. _feature scaling: -Missing values -~~~~~~~~~~~~~~ +Feature scaling +~~~~~~~~~~~~~~~ -The following examples demonstrate how to handle missing values in a dataset. +The following examples demonstrate how to scale a dataset. .. code:: python from arm_preprocessing.dataset import Dataset # Initialise dataset with filename and format - dataset = Dataset('examples/missing_values/data', format='csv') - - # Load dataset + dataset = Dataset('datasets/Abalone', format='csv') dataset.load() - # Remove columns with missing data - dataset.missing_values(method='column') + # Scale dataset using normalisation + dataset.scale(method='normalisation') .. code:: python from arm_preprocessing.dataset import Dataset # Initialise dataset with filename and format - dataset = Dataset('examples/missing_values/data', format='csv') - - # Load dataset + dataset = Dataset('datasets/Abalone', format='csv') dataset.load() - # Remove rows with missing data - dataset.missing_values(method='row') + # Scale dataset using standardisation + dataset.scale(method='standardisation') + +.. 
_feature selection: + +Feature selection +~~~~~~~~~~~~~~~~~ + +The following examples demonstrate how to select features from a dataset. .. code:: python from arm_preprocessing.dataset import Dataset # Initialise dataset with filename and format - dataset = Dataset('examples/missing_values/data', format='csv') - - # Load dataset + dataset = Dataset('datasets/sportydatagen', format='csv') dataset.load() - # Impute missing data - dataset.missing_values(method='impute') \ No newline at end of file + # Feature selection + dataset.feature_selection( + method='kendall', threshold=0.15, class_column='calories') \ No newline at end of file diff --git a/examples/feature_selection/feature_selection.py b/examples/feature_selection/feature_selection.py new file mode 100644 index 0000000..8b5f7a9 --- /dev/null +++ b/examples/feature_selection/feature_selection.py @@ -0,0 +1,14 @@ +""" +Example demonstrates how to keep only +the most important features in a dataset. +""" + +from arm_preprocessing.dataset import Dataset + +# Initialise dataset with filename and format +dataset = Dataset('datasets/sportydatagen', format='csv') +dataset.load() + +# Feature selection +dataset.feature_selection( + method='kendall', threshold=0.15, class_column='calories') diff --git a/examples/niaarm/niaarm_integration.py b/examples/niaarm/niaarm_integration.py new file mode 100644 index 0000000..9bdfb9a --- /dev/null +++ b/examples/niaarm/niaarm_integration.py @@ -0,0 +1,45 @@ +from arm_preprocessing.dataset import Dataset +import niaarm +from niapy.algorithms.basic import DifferentialEvolution + +# Load dataset +dataset = Dataset('datasets/Abalone', format='csv') +dataset.load() + +# Squash dataset +dataset.squash(threshold=0.85, similarity='euclidean') + +# Impute missing values +dataset.missing_values(method='impute') + +# Drop 'Sex' column +dataset.data.drop('Sex', axis=1, inplace=True) + +# Scale dataset +dataset.scale(method='normalisation') + +# Feature selection +dataset.feature_selection( + 
    method='kendall', threshold=0.25, class_column='Rings') + +# Discretise dataset using equal width, equal frequency, and k-means +dataset.discretise(method='equal_width', num_bins=10, columns=['Height']) +dataset.discretise(method='equal_frequency', num_bins=5, columns=['Diameter']) +dataset.discretise(method='kmeans', num_bins=5, columns=[ + 'Whole weight', 'Shell weight']) + +# Identify dataset and output dataset statistics +dataset.identify_dataset() +dataset.dataset_statistics() + +# Association rule mining +algo = DifferentialEvolution( + population_size=50, differential_weight=0.5, crossover_probability=0.9) +metrics = ('support', 'confidence') +rules, run_time = niaarm.get_rules( + niaarm.Dataset(dataset.data), algo, metrics, max_iters=30, logging=True) + +# Results +print(rules) +print(f'Run Time: {run_time}') +rules.to_csv('output.csv') diff --git a/examples/scaling/normalisation.py b/examples/scaling/normalisation.py new file mode 100644 index 0000000..13dae04 --- /dev/null +++ b/examples/scaling/normalisation.py @@ -0,0 +1,13 @@ +""" +Example demonstrates how to scale +the dataset using normalisation +""" + +from arm_preprocessing.dataset import Dataset + +# Initialise dataset with filename and format +dataset = Dataset('datasets/Abalone', format='csv') +dataset.load() + +# Scale dataset using normalisation +dataset.scale(method='normalisation') diff --git a/examples/scaling/standardisation.py b/examples/scaling/standardisation.py new file mode 100644 index 0000000..7a694f9 --- /dev/null +++ b/examples/scaling/standardisation.py @@ -0,0 +1,13 @@ +""" +Example demonstrates how to scale +the dataset using standardisation +""" + +from arm_preprocessing.dataset import Dataset + +# Initialise dataset with filename and format
dataset = Dataset('datasets/Abalone', format='csv') +dataset.load() + +# Scale dataset using standardisation +dataset.scale(method='standardisation') diff --git a/pyproject.toml b/pyproject.toml index 98b9cae..22c4fe2 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "arm-preprocessing" -version = "0.1.1" +version = "0.2.0" description = "Implementation of several preprocessing techniques for Association Rule Mining (ARM)" authors = ["Tadej Lahovnik ", "Iztok Fister Jr. "] keywords = ['association rule mining', 'data science', 'preprocessing'] diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 7f4d6a4..fc3556e 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -134,6 +134,69 @@ def test_missing_values_invalid_method(): dataset.missing_values(method='invalid_method') +def test_feature_scaling_normalisation(): + # Test feature scaling using normalisation + dataset = Dataset('datasets/Abalone', format='csv') + dataset.load() + dataset.scale(method='normalisation') + for column in dataset.data.columns: + # Skip non-numerical columns + if dataset.data[column].dtype in ['datetime64[ns]', 'object']: + continue + assert dataset.data[column].min() >= 0 + assert dataset.data[column].max() <= 1 + + +def test_feature_scaling_standardisation(): + # Test feature scaling using standardisation + dataset = Dataset('datasets/Abalone', format='csv') + dataset.load() + dataset.scale(method='standardisation') + for column in dataset.data.columns: + # Skip non-numerical columns + if dataset.data[column].dtype in ['datetime64[ns]', 'object']: + continue + assert dataset.data[column].mean() == pytest.approx(0, abs=0.01) + assert dataset.data[column].std() == pytest.approx(1, abs=0.01) + + +def test_feature_scaling_invalid_method(): + # Test invalid method handling + dataset = Dataset('datasets/Abalone', format='csv') + dataset.load() + with pytest.raises(ValueError, match='Invalid scaling method'): + dataset.scale(method='invalid_method') + + +def test_feature_selection_numerical(): + # Test feature selection for numerical dataset + dataset = Dataset('datasets/sportydatagen', format='csv') + dataset.load() + no_columns_before = len(dataset.data.columns) + 
dataset.feature_selection( + method='pearson', threshold=0.15, class_column='calories') + no_columns_after = len(dataset.data.columns) + assert no_columns_before > no_columns_after + + +def test_feature_selection_categorical(): + # Test feature selection for categorical dataset + dataset = Dataset('datasets/Abalone', format='csv') + dataset.load() + with pytest.raises(ValueError, match='Column .* is not numerical'): + dataset.feature_selection( + method='pearson', threshold=0.15, class_column='Rings') + + +def test_feature_selection_invalid_method(): + # Test invalid method handling + dataset = Dataset('datasets/sportydatagen', format='csv') + dataset.load() + with pytest.raises(ValueError, match='Invalid feature selection method'): + dataset.feature_selection( + method='invalid_method', threshold=0.15, class_column='calories') + + def test_filter_between_dates(): # Test filtering between dates dataset = Dataset(