diff --git a/.github/workflows/tests_01.yml b/.github/workflows/tests_01.yml index 9f12863f..3951257b 100644 --- a/.github/workflows/tests_01.yml +++ b/.github/workflows/tests_01.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python: [3.11] + python: [3.12] env: BIGML_USERNAME: ${{ secrets.BIGML_USERNAME }} BIGML_API_KEY: ${{ secrets.BIGML_API_KEY }} diff --git a/.github/workflows/tests_05.yml b/.github/workflows/tests_05.yml index 8cc673ec..ed1cac5f 100644 --- a/.github/workflows/tests_05.yml +++ b/.github/workflows/tests_05.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python: [3.11] + python: [3.12] env: BIGML_USERNAME: ${{ secrets.BIGML_USERNAME }} BIGML_API_KEY: ${{ secrets.BIGML_API_KEY }} diff --git a/.github/workflows/tests_22.yml b/.github/workflows/tests_22.yml index c75adca1..46784de2 100644 --- a/.github/workflows/tests_22.yml +++ b/.github/workflows/tests_22.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python: [3.11] + python: [3.12] env: BIGML_USERNAME: ${{ secrets.BIGML_USERNAME }} BIGML_API_KEY: ${{ secrets.BIGML_API_KEY }} diff --git a/.github/workflows/tests_23.yml b/.github/workflows/tests_23.yml index 042d57cc..892a73d6 100644 --- a/.github/workflows/tests_23.yml +++ b/.github/workflows/tests_23.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python: [3.11] + python: [3.12] env: BIGML_USERNAME: ${{ secrets.BIGML_USERNAME }} BIGML_API_KEY: ${{ secrets.BIGML_API_KEY }} diff --git a/.github/workflows/tests_36.yml b/.github/workflows/tests_36.yml index 7b78c0a5..a766fa97 100644 --- a/.github/workflows/tests_36.yml +++ b/.github/workflows/tests_36.yml @@ -12,7 +12,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python: [3.11] + python: [3.12] env: BIGML_USERNAME: ${{ secrets.BIGML_USERNAME }} BIGML_API_KEY: ${{ secrets.BIGML_API_KEY }} diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 7e93ed20..d74e663d 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,9 +1,22 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required version: 2 +# Set the version of Python and other tools you might need build: os: ubuntu-22.04 tools: - python: "3.10" + python: "3.12" +# Build documentation in the docs/ directory with Sphinx sphinx: configuration: docs/conf.py + +# We recommend specifying your dependencies to enable reproducible builds: +# https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html +python: + install: + - requirements: docs/requirements.txt diff --git a/HISTORY.rst b/HISTORY.rst index 2aa2591a..6c85c8cd 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -3,6 +3,29 @@ History ------- +9.8.3 (2025-03-27) +------------------ + +- Fixing annotations update for regions as lists. + +9.8.2 (2025-03-21) +------------------ + +- Retrying annotations update to avoid temporary concurrency issues in + source composites updates. + +9.8.1 (2025-01-14) +------------------ + +- Fixing annotations update in images composite sources. + +9.8.0 (2024-10-02) +------------------ + +- Fixing the get_leaves function for local decision trees. +- Fixing setup issues in Python3.12 +- Changing documentation templates. 
+ 9.8.0.dev1 (2024-02-28) ----------------------- diff --git a/bigml/anomaly.py b/bigml/anomaly.py index 07f3f6f0..4a345724 100644 --- a/bigml/anomaly.py +++ b/bigml/anomaly.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2020-2023 BigML +# Copyright 2020-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api.py b/bigml/api.py index 21d80679..55b1e591 100644 --- a/bigml/api.py +++ b/bigml/api.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=too-many-ancestors,non-parent-init-called, unused-import, no-member # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/anomalyhandler.py b/bigml/api_handlers/anomalyhandler.py index 1bb07dd0..03ece5e2 100644 --- a/bigml/api_handlers/anomalyhandler.py +++ b/bigml/api_handlers/anomalyhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/anomalyscorehandler.py b/bigml/api_handlers/anomalyscorehandler.py index fd0df39b..1398d539 100644 --- a/bigml/api_handlers/anomalyscorehandler.py +++ b/bigml/api_handlers/anomalyscorehandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/associationhandler.py b/bigml/api_handlers/associationhandler.py index c6957cf4..994a0050 100644 --- a/bigml/api_handlers/associationhandler.py +++ b/bigml/api_handlers/associationhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/associationsethandler.py b/bigml/api_handlers/associationsethandler.py index cd8176c8..f1c13bb1 100644 --- a/bigml/api_handlers/associationsethandler.py +++ b/bigml/api_handlers/associationsethandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/batchanomalyscorehandler.py b/bigml/api_handlers/batchanomalyscorehandler.py index b55f6d27..07516a27 100644 --- a/bigml/api_handlers/batchanomalyscorehandler.py +++ b/bigml/api_handlers/batchanomalyscorehandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/api_handlers/batchcentroidhandler.py b/bigml/api_handlers/batchcentroidhandler.py index a5859d4a..79c25f52 100644 --- a/bigml/api_handlers/batchcentroidhandler.py +++ b/bigml/api_handlers/batchcentroidhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/batchpredictionhandler.py b/bigml/api_handlers/batchpredictionhandler.py index 1f2da496..462d127a 100644 --- a/bigml/api_handlers/batchpredictionhandler.py +++ b/bigml/api_handlers/batchpredictionhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/batchprojectionhandler.py b/bigml/api_handlers/batchprojectionhandler.py index 5d9dcbe0..bfb05228 100644 --- a/bigml/api_handlers/batchprojectionhandler.py +++ b/bigml/api_handlers/batchprojectionhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/batchtopicdistributionhandler.py b/bigml/api_handlers/batchtopicdistributionhandler.py index 0f09a94a..2a1bd204 100644 --- a/bigml/api_handlers/batchtopicdistributionhandler.py +++ b/bigml/api_handlers/batchtopicdistributionhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2016-2023 BigML +# Copyright 2016-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/centroidhandler.py b/bigml/api_handlers/centroidhandler.py index 10a836ae..d0455649 100644 --- a/bigml/api_handlers/centroidhandler.py +++ b/bigml/api_handlers/centroidhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/clusterhandler.py b/bigml/api_handlers/clusterhandler.py index 1511a37b..ffc833eb 100644 --- a/bigml/api_handlers/clusterhandler.py +++ b/bigml/api_handlers/clusterhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/api_handlers/configurationhandler.py b/bigml/api_handlers/configurationhandler.py index 12a28a96..4e2e1ae1 100644 --- a/bigml/api_handlers/configurationhandler.py +++ b/bigml/api_handlers/configurationhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/correlationhandler.py b/bigml/api_handlers/correlationhandler.py index ab923aab..29fedc23 100644 --- a/bigml/api_handlers/correlationhandler.py +++ b/bigml/api_handlers/correlationhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/datasethandler.py b/bigml/api_handlers/datasethandler.py index 656158e8..04ac3ec6 100644 --- a/bigml/api_handlers/datasethandler.py +++ b/bigml/api_handlers/datasethandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/deepnethandler.py b/bigml/api_handlers/deepnethandler.py index 85d91485..ff966793 100644 --- a/bigml/api_handlers/deepnethandler.py +++ b/bigml/api_handlers/deepnethandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/ensemblehandler.py b/bigml/api_handlers/ensemblehandler.py index e03b96fa..6ebd035e 100644 --- a/bigml/api_handlers/ensemblehandler.py +++ b/bigml/api_handlers/ensemblehandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/evaluationhandler.py b/bigml/api_handlers/evaluationhandler.py index 37a9fe5b..82b224d4 100644 --- a/bigml/api_handlers/evaluationhandler.py +++ b/bigml/api_handlers/evaluationhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/executionhandler.py b/bigml/api_handlers/executionhandler.py index 436e64e1..2fbf6f7e 100644 --- a/bigml/api_handlers/executionhandler.py +++ b/bigml/api_handlers/executionhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/api_handlers/externalconnectorhandler.py b/bigml/api_handlers/externalconnectorhandler.py index 9a11de46..7d33a58e 100644 --- a/bigml/api_handlers/externalconnectorhandler.py +++ b/bigml/api_handlers/externalconnectorhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2020-2023 BigML +# Copyright 2020-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/forecasthandler.py b/bigml/api_handlers/forecasthandler.py index f09f2834..cfaba279 100644 --- a/bigml/api_handlers/forecasthandler.py +++ b/bigml/api_handlers/forecasthandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/fusionhandler.py b/bigml/api_handlers/fusionhandler.py index 315fa907..90e22ee7 100644 --- a/bigml/api_handlers/fusionhandler.py +++ b/bigml/api_handlers/fusionhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/libraryhandler.py b/bigml/api_handlers/libraryhandler.py index 18ee3a8e..36055eee 100644 --- a/bigml/api_handlers/libraryhandler.py +++ b/bigml/api_handlers/libraryhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/linearhandler.py b/bigml/api_handlers/linearhandler.py index 4804fd51..3f24a5f8 100644 --- a/bigml/api_handlers/linearhandler.py +++ b/bigml/api_handlers/linearhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2019-2023 BigML +# Copyright 2019-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/logistichandler.py b/bigml/api_handlers/logistichandler.py index 5d00754a..744422bf 100644 --- a/bigml/api_handlers/logistichandler.py +++ b/bigml/api_handlers/logistichandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/modelhandler.py b/bigml/api_handlers/modelhandler.py index f2aee4f0..0a94d342 100644 --- a/bigml/api_handlers/modelhandler.py +++ b/bigml/api_handlers/modelhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/api_handlers/optimlhandler.py b/bigml/api_handlers/optimlhandler.py index 1f1e5f99..cd5853d5 100644 --- a/bigml/api_handlers/optimlhandler.py +++ b/bigml/api_handlers/optimlhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/pcahandler.py b/bigml/api_handlers/pcahandler.py index 207591c7..933d73da 100644 --- a/bigml/api_handlers/pcahandler.py +++ b/bigml/api_handlers/pcahandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/predictionhandler.py b/bigml/api_handlers/predictionhandler.py index 96d2f6db..c2c160b2 100644 --- a/bigml/api_handlers/predictionhandler.py +++ b/bigml/api_handlers/predictionhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/projecthandler.py b/bigml/api_handlers/projecthandler.py index a4d17d2b..3c3b7a51 100644 --- a/bigml/api_handlers/projecthandler.py +++ b/bigml/api_handlers/projecthandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/projectionhandler.py b/bigml/api_handlers/projectionhandler.py index 4b227947..d463fca8 100644 --- a/bigml/api_handlers/projectionhandler.py +++ b/bigml/api_handlers/projectionhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/resourcehandler.py b/bigml/api_handlers/resourcehandler.py index caef0e99..524f53ef 100644 --- a/bigml/api_handlers/resourcehandler.py +++ b/bigml/api_handlers/resourcehandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method,unused-import # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/samplehandler.py b/bigml/api_handlers/samplehandler.py index 53861ae2..d50baf0b 100644 --- a/bigml/api_handlers/samplehandler.py +++ b/bigml/api_handlers/samplehandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/api_handlers/scripthandler.py b/bigml/api_handlers/scripthandler.py index c012d985..d03ed771 100644 --- a/bigml/api_handlers/scripthandler.py +++ b/bigml/api_handlers/scripthandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/sourcehandler.py b/bigml/api_handlers/sourcehandler.py index ac897413..bd4b6e6b 100644 --- a/bigml/api_handlers/sourcehandler.py +++ b/bigml/api_handlers/sourcehandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -24,6 +24,8 @@ import sys import os import numbers +import time +import logging from urllib import parse @@ -59,17 +61,17 @@ HTTP_CREATED, HTTP_BAD_REQUEST, HTTP_UNAUTHORIZED, HTTP_PAYMENT_REQUIRED, HTTP_NOT_FOUND, HTTP_TOO_MANY_REQUESTS, - HTTP_INTERNAL_SERVER_ERROR, GAE_ENABLED, SEND_JSON) + HTTP_INTERNAL_SERVER_ERROR, GAE_ENABLED, SEND_JSON, LOGGER) from bigml.bigmlconnection import json_load from bigml.api_handlers.resourcehandler import check_resource_type, \ resource_is_ready, get_source_id, get_id from bigml.constants import SOURCE_PATH, IMAGE_EXTENSIONS -from bigml.api_handlers.resourcehandler import ResourceHandlerMixin, LOGGER +from bigml.api_handlers.resourcehandler import ResourceHandlerMixin from bigml.fields import Fields -MAX_CHANGES = 500 - +MAX_CHANGES = 5 +MAX_RETRIES = 5 def compact_regions(regions): """Returns the list of regions in the compact value used for updates """ @@ -508,6 +510,8 @@ def update_composite_annotations(self, source, images_file, try: _ = file_list.index(filename) except ValueError: + LOGGER.error("WARNING: Could not find annotated file (%s)" + " in the composite's sources list", filename) continue for key in annotation.keys(): if key == "file": @@ -539,9 +543,12 @@ def update_composite_annotations(self, source, images_file, "components": source_ids}) elif optype == "regions": for value, source_id in values: + if isinstance(value, list): + # dictionary should contain the bigml-coco format + value = compact_regions(value) changes.append( {"field": field, - "value": compact_regions(value), + "value": value, "components": [source_id]}) else: for value, source_id in values: @@ -550,15 +557,36 @@ def update_composite_annotations(self, source, images_file, "value": value, "components": [source_id]}) except Exception: + LOGGER.error("WARNING: Problem adding annotation to %s (%s)", + field, values) pass # we need to limit the amount of changes per update - for offset in range(0, int(len(changes) / MAX_CHANGES) + 1): - new_batch = changes[offset: offset + MAX_CHANGES] + batches_number = int(len(changes) / MAX_CHANGES) + for offset in range(0, batches_number + 1): + new_batch = changes[ + offset * MAX_CHANGES: (offset + 1) * MAX_CHANGES] if new_batch: source = self.update_source(source, {"row_values": new_batch}) - self.ok(source) + counter = 0 + while source["error"] is not None and counter < MAX_RETRIES: + # retrying in case update is temporarily unavailable + counter += 1 + time.sleep(counter) + source = self.get_source(source) + self.ok(source) + source = self.update_source(source, + 
{"row_values": new_batch}) + if source["error"] is not None: + err_str = json.dumps(source["error"]) + v_str = json.dumps(new_batch) + LOGGER.error("WARNING: Some annotations were not updated " + f" (error: {err_str}, values: {v_str})") + if not self.ok(source): + raise Exception( + f"Failed to update {len(new_batch)} annotations.") + time.sleep(0.1) return source diff --git a/bigml/api_handlers/statisticaltesthandler.py b/bigml/api_handlers/statisticaltesthandler.py index 6bd72330..eca91255 100644 --- a/bigml/api_handlers/statisticaltesthandler.py +++ b/bigml/api_handlers/statisticaltesthandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/timeserieshandler.py b/bigml/api_handlers/timeserieshandler.py index ae68eb2d..2d57a08c 100644 --- a/bigml/api_handlers/timeserieshandler.py +++ b/bigml/api_handlers/timeserieshandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/topicdistributionhandler.py b/bigml/api_handlers/topicdistributionhandler.py index b7fad37c..117cefd2 100644 --- a/bigml/api_handlers/topicdistributionhandler.py +++ b/bigml/api_handlers/topicdistributionhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2016-2023 BigML +# Copyright 2016-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/api_handlers/topicmodelhandler.py b/bigml/api_handlers/topicmodelhandler.py index 78dca0a5..a34b904b 100644 --- a/bigml/api_handlers/topicmodelhandler.py +++ b/bigml/api_handlers/topicmodelhandler.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=abstract-method # -# Copyright 2016-2023 BigML +# Copyright 2016-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/association.py b/bigml/association.py index 6cc50f22..a3b65d76 100644 --- a/bigml/association.py +++ b/bigml/association.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/associationrule.py b/bigml/associationrule.py index c7f82835..63944342 100644 --- a/bigml/associationrule.py +++ b/bigml/associationrule.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/basemodel.py b/bigml/basemodel.py index c4d380a1..0c22dc54 100644 --- a/bigml/basemodel.py +++ b/bigml/basemodel.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2013-2023 BigML +# Copyright 2013-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/bigmlconnection.py b/bigml/bigmlconnection.py index f47e6b32..1e680915 100644 --- a/bigml/bigmlconnection.py +++ b/bigml/bigmlconnection.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -50,7 +50,7 @@ LOG_FORMAT = '%(asctime)-15s: %(message)s' LOGGER = logging.getLogger('BigML') -CONSOLE = logging.StreamHandler() +CONSOLE = logging.StreamHandler(sys.stdout) CONSOLE.setLevel(logging.WARNING) LOGGER.addHandler(CONSOLE) @@ -138,7 +138,7 @@ def debug_request(method, url, **kwargs): """ response = original_request(method, url, **kwargs) - logging.debug("Data: %s", response.request.body) + LOGGER.debug("Data: %s", response.request.body) try: response_content = "Download status is %s" % response.status_code \ if "download" in url else \ @@ -147,7 +147,7 @@ def debug_request(method, url, **kwargs): response_content = response.content response_content = response_content[0: 256] if short_debug else \ response_content - logging.debug("Response: %s\n", response_content) + LOGGER.debug("Response: %s\n", response_content) return response original_request = requests.api.request @@ -213,9 +213,8 @@ def __init__(self, username=None, api_key=None, # when using GAE will fail pass - logging.basicConfig(format=LOG_FORMAT, - level=logging_level, - stream=sys.stdout) + LOGGER.forma = LOG_FORMAT, + LOGGER.level = logging_level if username is None: try: @@ -406,7 +405,7 @@ def _create(self, url, body, verify=None, organization=None): error = json_load(response.content) LOGGER.error(self.error_message(error, method='create')) elif code != HTTP_ACCEPTED: - LOGGER.error("Unexpected error (%s)", code) + LOGGER.error("CREATE Unexpected error (%s)", code) code = HTTP_INTERNAL_SERVER_ERROR except ValueError as exc: LOGGER.error("Malformed response: %s", str(exc)) @@ -489,7 +488,7 @@ def _get(self, url, query_string='', LOGGER.error(self.error_message(error, method='get', resource_id=resource_id)) else: - LOGGER.error("Unexpected error (%s)", code) + LOGGER.error("GET Unexpected error (%s)", code) code = HTTP_INTERNAL_SERVER_ERROR except ValueError as exc: @@ -582,7 +581,7 @@ def _list(self, url, query_string='', organization=None): HTTP_TOO_MANY_REQUESTS]: error = json_load(response.content) else: - LOGGER.error("Unexpected error (%s)", code) + LOGGER.error("LIST Unexpected error (%s)", code) code = HTTP_INTERNAL_SERVER_ERROR except ValueError as exc: LOGGER.error("Malformed response: %s", str(exc)) @@ -650,7 +649,6 @@ def _update(self, url, body, organization=None, resource_id=None): location, resource, error) try: code = response.status_code - if code == HTTP_ACCEPTED: resource = json_load(response.content) resource_id = resource['resource'] @@ -663,7 +661,7 @@ def _update(self, url, body, organization=None, resource_id=None): LOGGER.error(self.error_message(error, method='update', resource_id=resource_id)) else: - LOGGER.error("Unexpected error (%s)", code) + LOGGER.error("UPDATE Unexpected error (%s)", 
code) code = HTTP_INTERNAL_SERVER_ERROR except ValueError: LOGGER.error("Malformed response") diff --git a/bigml/centroid.py b/bigml/centroid.py index 4e5f30c0..534cb562 100644 --- a/bigml/centroid.py +++ b/bigml/centroid.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/cluster.py b/bigml/cluster.py index 040c108b..5739554b 100644 --- a/bigml/cluster.py +++ b/bigml/cluster.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/constants.py b/bigml/constants.py index 6423faff..5171d557 100644 --- a/bigml/constants.py +++ b/bigml/constants.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/dataset.py b/bigml/dataset.py index 280c285d..5c548e61 100644 --- a/bigml/dataset.py +++ b/bigml/dataset.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2022-2023 BigML +# Copyright 2022-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -19,14 +19,13 @@ """ import os -import logging -import warnings import subprocess from bigml.fields import Fields, sorted_headers, get_new_fields from bigml.api import get_api_connection, get_dataset_id, get_status from bigml.basemodel import get_resource_dict -from bigml.util import DEFAULT_LOCALE, use_cache, cast, load, dump, dumps +from bigml.util import DEFAULT_LOCALE, use_cache, cast, load, dump, dumps, \ + sensenet_logging from bigml.constants import FINISHED from bigml.flatline import Flatline from bigml.featurizer import Featurizer @@ -40,12 +39,9 @@ #pylint: disable=locally-disabled,bare-except,ungrouped-imports try: - # avoiding tensorflow info logging - warnings.filterwarnings("ignore", category=DeprecationWarning) - os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' - import tensorflow as tf - tf.get_logger().setLevel('ERROR') - tf.autograph.set_verbosity(0) + # bigml-sensenet should be installed for image processing + sensenet_logging() + import sensenet from bigml.images.featurizers import ImageFeaturizer as Featurizer except: pass diff --git a/bigml/deepnet.py b/bigml/deepnet.py index 75eed911..dbb45dc9 100644 --- a/bigml/deepnet.py +++ b/bigml/deepnet.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=wrong-import-position,ungrouped-imports # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
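The bigmlconnection.py hunk above points the console handler at sys.stdout, routes debug traces through the module's own `LOGGER`, and stops calling `logging.basicConfig` in the constructor. A minimal sketch of an equivalent setup using only the standard `logging` API; the logger name and format string come from the diff, while the `configure_logging` helper and the Formatter wiring are assumptions about the intent rather than the library's code:

```python
import logging
import sys

LOG_FORMAT = '%(asctime)-15s: %(message)s'

LOGGER = logging.getLogger('BigML')

# Console handler writing to stdout, as in the diff
CONSOLE = logging.StreamHandler(sys.stdout)
CONSOLE.setLevel(logging.WARNING)
CONSOLE.setFormatter(logging.Formatter(LOG_FORMAT))
LOGGER.addHandler(CONSOLE)


def configure_logging(logging_level=logging.ERROR):
    """Set the module logger level without touching the root logger."""
    LOGGER.setLevel(logging_level)


configure_logging(logging.DEBUG)
LOGGER.debug("Data: %s", {"example": True})
```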
You may obtain @@ -40,7 +40,6 @@ deepnet.predict({"petal length": 3, "petal width": 1}) """ -import logging import os import warnings @@ -49,37 +48,26 @@ from bigml.api import FINISHED from bigml.api import get_status, get_api_connection, get_deepnet_id from bigml.util import cast, use_cache, load, get_data_transformations, \ - PRECISION + PRECISION, sensenet_logging from bigml.basemodel import get_resource_dict, extract_objective from bigml.modelfields import ModelFields from bigml.laminar.constants import NUMERIC from bigml.model import parse_operating_point, sort_categories from bigml.constants import REGIONS, REGIONS_OPERATION_SETTINGS, \ DEFAULT_OPERATION_SETTINGS, REGION_SCORE_ALIAS, REGION_SCORE_THRESHOLD, \ - IMAGE, DECIMALS + IMAGE, DECIMALS, IOU_REMOTE_SETTINGS import bigml.laminar.numpy_ops as net import bigml.laminar.preprocess_np as pp try: - # avoiding tensorflow info logging - warnings.filterwarnings("ignore", category=DeprecationWarning) - os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' - import tensorflow as tf - tf.get_logger().setLevel('ERROR') - tf.autograph.set_verbosity(0) - LAMINAR_VERSION = False -except Exception: - LAMINAR_VERSION = True - -try: + sensenet_logging() from sensenet.models.wrappers import create_model from bigml.images.utils import to_relative_coordinates - from bigml.constants import IOU_REMOTE_SETTINGS + LAMINAR_VERSION = False except Exception: LAMINAR_VERSION = True -LOGGER = logging.getLogger('BigML') MEAN = "mean" STANDARD_DEVIATION = "stdev" diff --git a/bigml/domain.py b/bigml/domain.py index d36b4194..81a26ebc 100644 --- a/bigml/domain.py +++ b/bigml/domain.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/ensemble.py b/bigml/ensemble.py index 51e8b50e..94c96a77 100644 --- a/bigml/ensemble.py +++ b/bigml/ensemble.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -214,7 +214,6 @@ def __init__(self, ensemble, api=None, max_models=None, cache_get=None, # avoid checking fields because of old ensembles ensemble = retrieve_resource(self.api, self.resource_id, no_check_fields=True) - self.parent_id = ensemble.get('object', {}).get('dataset') self.name = ensemble.get('object', {}).get('name') self.description = ensemble.get('object', {}).get('description') diff --git a/bigml/ensemblepredictor.py b/bigml/ensemblepredictor.py index d266805b..cab2fbdd 100644 --- a/bigml/ensemblepredictor.py +++ b/bigml/ensemblepredictor.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/evaluation.py b/bigml/evaluation.py index bd79cc9a..76726589 100644 --- a/bigml/evaluation.py +++ b/bigml/evaluation.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2023 BigML +# Copyright 2023-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
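Both dataset.py and deepnet.py now delegate the TensorFlow log-silencing boilerplate to a `sensenet_logging` helper imported from bigml.util. Its body is not part of this patch; a plausible sketch, reconstructed from the inline suppression the diff removes, could look like the following (an assumption, not the actual implementation):

```python
import os
import warnings


def sensenet_logging():
    """Quiet TensorFlow/sensenet output before importing image models.

    Mirrors the inline suppression removed from dataset.py and deepnet.py.
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
    try:
        import tensorflow as tf
        tf.get_logger().setLevel('ERROR')
        tf.autograph.set_verbosity(0)
    except ImportError:
        # tensorflow is optional; nothing to silence if it is missing
        pass
```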
You may obtain diff --git a/bigml/exceptions.py b/bigml/exceptions.py index 77630bab..71e965f6 100644 --- a/bigml/exceptions.py +++ b/bigml/exceptions.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2021-2023 BigML +# Copyright 2021-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/execution.py b/bigml/execution.py index f4af728e..626cd06e 100644 --- a/bigml/execution.py +++ b/bigml/execution.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2019-2023 BigML +# Copyright 2019-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/featurizer.py b/bigml/featurizer.py index e39d754f..0a6d9e33 100644 --- a/bigml/featurizer.py +++ b/bigml/featurizer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2022-2023 BigML +# Copyright 2022-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/fields.py b/bigml/fields.py index 62023258..41246b62 100644 --- a/bigml/fields.py +++ b/bigml/fields.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=unbalanced-tuple-unpacking # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/flatline.py b/bigml/flatline.py index 7efba23a..ee18536a 100644 --- a/bigml/flatline.py +++ b/bigml/flatline.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2022-2023 BigML +# Copyright 2022-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/flattree.py b/bigml/flattree.py index cc190204..021d52d6 100644 --- a/bigml/flattree.py +++ b/bigml/flattree.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2019-2023 BigML +# Copyright 2019-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/fusion.py b/bigml/fusion.py index 3b88481c..c7ce7425 100644 --- a/bigml/fusion.py +++ b/bigml/fusion.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/generators/boosted_tree.py b/bigml/generators/boosted_tree.py index 96fb4a28..14bbf2be 100644 --- a/bigml/generators/boosted_tree.py +++ b/bigml/generators/boosted_tree.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2020-2023 BigML +# Copyright 2020-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/generators/model.py b/bigml/generators/model.py index 16be16c2..51c65e92 100644 --- a/bigml/generators/model.py +++ b/bigml/generators/model.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2020-2023 BigML +# Copyright 2020-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -135,8 +135,9 @@ def get_leaves(model, path=None, filter_function=None): offsets = model.offsets - def get_tree_leaves(tree, fields, path, leaves, filter_function=None): + def get_tree_leaves(tree, fields, path, filter_function=None): + leaves = [] node = get_node(tree) predicate = get_predicate(tree) if isinstance(predicate, list): @@ -149,8 +150,9 @@ def get_tree_leaves(tree, fields, path, leaves, filter_function=None): if children: for child in children: + leaves += get_tree_leaves(child, fields, - path[:], leaves, + path[:], filter_function=filter_function) else: leaf = { @@ -171,7 +173,7 @@ def get_tree_leaves(tree, fields, path, leaves, filter_function=None): or filter_function(leaf)): leaves += [leaf] return leaves - return get_tree_leaves(model.tree, model.fields, path, leaves, + return get_tree_leaves(model.tree, model.fields, path, filter_function) diff --git a/bigml/generators/tree.py b/bigml/generators/tree.py index 086f2108..95d7200e 100644 --- a/bigml/generators/tree.py +++ b/bigml/generators/tree.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2020-2023 BigML +# Copyright 2020-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/generators/tree_common.py b/bigml/generators/tree_common.py index 64596627..4a46b8e6 100644 --- a/bigml/generators/tree_common.py +++ b/bigml/generators/tree_common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2020-2023 BigML +# Copyright 2020-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/images/featurizers.py b/bigml/images/featurizers.py index c2b5ed64..d6919ed1 100644 --- a/bigml/images/featurizers.py +++ b/bigml/images/featurizers.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=invalid-name # -# Copyright 2022-2023 BigML +# Copyright 2022-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/images/utils.py b/bigml/images/utils.py index 1e0a10ba..26378deb 100644 --- a/bigml/images/utils.py +++ b/bigml/images/utils.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2022-2023 BigML +# Copyright 2022-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/io.py b/bigml/io.py index b6b878e3..c9dc0a20 100644 --- a/bigml/io.py +++ b/bigml/io.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # pylint: disable=R1732 # -# Copyright (c) 2015-2023 BigML, Inc +# Copyright (c) 2015-2025 BigML, Inc # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -19,7 +19,7 @@ """Python I/O functions. 
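The generators/model.py hunk above fixes get_leaves by no longer threading one mutable `leaves` list through the recursion: each recursive call now builds and returns its own list, and a parent accumulates its children's results with `leaves +=`. A toy version of that accumulation pattern on a plain nested-dict tree; the tree shape here is illustrative, not BigML's internal node format, and the public call in the library remains `get_leaves(model, path=None, filter_function=None)` as shown in the hunk:

```python
def collect_leaves(node, path=None, filter_function=None):
    """Return the leaves under `node`, each paired with the path to it."""
    path = path or []
    leaves = []                      # fresh list per call, as in the fix
    children = node.get("children", [])
    if children:
        for child in children:
            leaves += collect_leaves(child, path + [node["name"]],
                                     filter_function=filter_function)
    else:
        leaf = {"name": node["name"], "path": path}
        if filter_function is None or filter_function(leaf):
            leaves.append(leaf)
    return leaves


tree = {"name": "root", "children": [
    {"name": "a", "children": [{"name": "a1"}, {"name": "a2"}]},
    {"name": "b"}]}
print(collect_leaves(tree))
# -> leaves a1 and a2 with path ['root', 'a'], and b with path ['root']
```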
:author: jao -:date: Wed Apr 08, 2015-2023 17:52 +:date: Wed Apr 08, 2015-2025 17:52 """ diff --git a/bigml/item.py b/bigml/item.py index 3138a0e9..3314507a 100644 --- a/bigml/item.py +++ b/bigml/item.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/laminar/numpy_ops.py b/bigml/laminar/numpy_ops.py index 805850f3..85c21ea4 100644 --- a/bigml/laminar/numpy_ops.py +++ b/bigml/laminar/numpy_ops.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=invalid-name,missing-function-docstring # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/laminar/preprocess_np.py b/bigml/laminar/preprocess_np.py index c2d6cb9b..95e64899 100644 --- a/bigml/laminar/preprocess_np.py +++ b/bigml/laminar/preprocess_np.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=invalid-name,missing-function-docstring # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/linear.py b/bigml/linear.py index 80522948..c6e00407 100644 --- a/bigml/linear.py +++ b/bigml/linear.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/local_model.py b/bigml/local_model.py index 8b545ef1..c8ed68c9 100644 --- a/bigml/local_model.py +++ b/bigml/local_model.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # pylint: disable=super-init-not-called # -# Copyright 2023 BigML +# Copyright 2023-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/logistic.py b/bigml/logistic.py index 691199b9..67199512 100644 --- a/bigml/logistic.py +++ b/bigml/logistic.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/model.py b/bigml/model.py index 47b3f56c..560d5c37 100644 --- a/bigml/model.py +++ b/bigml/model.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2013-2023 BigML +# Copyright 2013-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/modelfields.py b/bigml/modelfields.py index 2eb9b38a..964015f0 100644 --- a/bigml/modelfields.py +++ b/bigml/modelfields.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2013-2023 BigML +# Copyright 2013-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/multimodel.py b/bigml/multimodel.py index bc1243eb..85e7eb9e 100644 --- a/bigml/multimodel.py +++ b/bigml/multimodel.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/multivote.py b/bigml/multivote.py index 9508c835..873e79aa 100644 --- a/bigml/multivote.py +++ b/bigml/multivote.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=dangerous-default-value # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/multivotelist.py b/bigml/multivotelist.py index ee604e39..72f2cb56 100644 --- a/bigml/multivotelist.py +++ b/bigml/multivotelist.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/path.py b/bigml/path.py index 6f533fc7..e85a2ac3 100644 --- a/bigml/path.py +++ b/bigml/path.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/pca.py b/bigml/pca.py index 10d37827..22eb37c8 100644 --- a/bigml/pca.py +++ b/bigml/pca.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/pipeline/pipeline.py b/bigml/pipeline/pipeline.py index 83c62337..20cbb8b9 100644 --- a/bigml/pipeline/pipeline.py +++ b/bigml/pipeline/pipeline.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,cyclic-import # -# Copyright 2022-2023 BigML +# Copyright 2022-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/pipeline/transformer.py b/bigml/pipeline/transformer.py index d099bfa4..3b983cd8 100644 --- a/bigml/pipeline/transformer.py +++ b/bigml/pipeline/transformer.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2022-2023 BigML +# Copyright 2022-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/predicate.py b/bigml/predicate.py index 29ac5068..ed6ec690 100644 --- a/bigml/predicate.py +++ b/bigml/predicate.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2013-2023 BigML +# Copyright 2013-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/predicate_utils/utils.py b/bigml/predicate_utils/utils.py index 8f3801b0..7239d01e 100644 --- a/bigml/predicate_utils/utils.py +++ b/bigml/predicate_utils/utils.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2020-2023 BigML +# Copyright 2020-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/predicates.py b/bigml/predicates.py index 36abd4a9..54537858 100644 --- a/bigml/predicates.py +++ b/bigml/predicates.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/predict_utils/boosting.py b/bigml/predict_utils/boosting.py index 89b10108..1380e96d 100644 --- a/bigml/predict_utils/boosting.py +++ b/bigml/predict_utils/boosting.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2020-2023 BigML +# Copyright 2020-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/predict_utils/classification.py b/bigml/predict_utils/classification.py index 56a3a013..862b32c7 100644 --- a/bigml/predict_utils/classification.py +++ b/bigml/predict_utils/classification.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2020-2023 BigML +# Copyright 2020-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/predict_utils/common.py b/bigml/predict_utils/common.py index e2837594..6b967f52 100644 --- a/bigml/predict_utils/common.py +++ b/bigml/predict_utils/common.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2020-2023 BigML +# Copyright 2020-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/predict_utils/regression.py b/bigml/predict_utils/regression.py index 82371234..4c291f05 100644 --- a/bigml/predict_utils/regression.py +++ b/bigml/predict_utils/regression.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2020-2023 BigML +# Copyright 2020-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/prediction.py b/bigml/prediction.py index a284575b..19327510 100644 --- a/bigml/prediction.py +++ b/bigml/prediction.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/shapwrapper.py b/bigml/shapwrapper.py index 659c45aa..65586ca2 100644 --- a/bigml/shapwrapper.py +++ b/bigml/shapwrapper.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # pylint: disable=super-init-not-called # -# Copyright 2023 BigML +# Copyright 2023-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/supervised.py b/bigml/supervised.py index d10da99e..57155fa8 100644 --- a/bigml/supervised.py +++ b/bigml/supervised.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # pylint: disable=super-init-not-called # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/compare_dataset_steps.py b/bigml/tests/compare_dataset_steps.py index f3293f9f..04bc9110 100644 --- a/bigml/tests/compare_dataset_steps.py +++ b/bigml/tests/compare_dataset_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2022-2023 BigML +# Copyright 2022-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/compare_forecasts_steps.py b/bigml/tests/compare_forecasts_steps.py index af10b02d..0d4fe85a 100644 --- a/bigml/tests/compare_forecasts_steps.py +++ b/bigml/tests/compare_forecasts_steps.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/compare_pipeline_steps.py b/bigml/tests/compare_pipeline_steps.py index 4007aef6..146ea408 100644 --- a/bigml/tests/compare_pipeline_steps.py +++ b/bigml/tests/compare_pipeline_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2022-2023 BigML +# Copyright 2022-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/compare_predictions_steps.py b/bigml/tests/compare_predictions_steps.py index 66f1bc23..b0019411 100644 --- a/bigml/tests/compare_predictions_steps.py +++ b/bigml/tests/compare_predictions_steps.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,unused-argument,no-member #pylint: disable=locally-disabled,pointless-string-statement # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/compute_lda_prediction_steps.py b/bigml/tests/compute_lda_prediction_steps.py index 4479057b..5ec5f6e8 100644 --- a/bigml/tests/compute_lda_prediction_steps.py +++ b/bigml/tests/compute_lda_prediction_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2016-2023 BigML +# Copyright 2016-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/tests/compute_multivote_prediction_steps.py b/bigml/tests/compute_multivote_prediction_steps.py index e18f754c..251423c1 100644 --- a/bigml/tests/compute_multivote_prediction_steps.py +++ b/bigml/tests/compute_multivote_prediction_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2012, 2015-2023 BigML +# Copyright 2012, 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_anomaly_steps.py b/bigml/tests/create_anomaly_steps.py index d0fecedd..f0b18d3a 100644 --- a/bigml/tests/create_anomaly_steps.py +++ b/bigml/tests/create_anomaly_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_association_steps.py b/bigml/tests/create_association_steps.py index 2b56fceb..b54cd9be 100644 --- a/bigml/tests/create_association_steps.py +++ b/bigml/tests/create_association_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_batch_prediction_steps.py b/bigml/tests/create_batch_prediction_steps.py index 98bbc4d4..7988a3f9 100644 --- a/bigml/tests/create_batch_prediction_steps.py +++ b/bigml/tests/create_batch_prediction_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_batch_projection_steps.py b/bigml/tests/create_batch_projection_steps.py index 9dcb8d3d..d18debf7 100644 --- a/bigml/tests/create_batch_projection_steps.py +++ b/bigml/tests/create_batch_projection_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_cluster_steps.py b/bigml/tests/create_cluster_steps.py index 1a993a40..f6c9e002 100644 --- a/bigml/tests/create_cluster_steps.py +++ b/bigml/tests/create_cluster_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/tests/create_configuration_steps.py b/bigml/tests/create_configuration_steps.py index f657be7e..5116986d 100644 --- a/bigml/tests/create_configuration_steps.py +++ b/bigml/tests/create_configuration_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_correlation_steps.py b/bigml/tests/create_correlation_steps.py index e3c03894..c5421c6b 100644 --- a/bigml/tests/create_correlation_steps.py +++ b/bigml/tests/create_correlation_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_dataset_steps.py b/bigml/tests/create_dataset_steps.py index 0baf4a85..b341ba51 100644 --- a/bigml/tests/create_dataset_steps.py +++ b/bigml/tests/create_dataset_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -222,3 +222,16 @@ def clone_dataset(step, dataset): def the_cloned_dataset_is(step, dataset): """Checking the dataset is a clone""" eq_(world.dataset["origin"], dataset) + + +def check_annotations(step, annotations_field, annotations_num): + """Checking the dataset contains a number of annotations""" + annotations_num = int(annotations_num) + field = world.dataset["fields"][annotations_field] + if field["optype"] == "regions": + count = field["summary"]["regions"]["sum"] + else: + count = 0 + for _, num in field["summary"]["categories"]: + count += num + eq_(count, annotations_num) diff --git a/bigml/tests/create_ensemble_steps.py b/bigml/tests/create_ensemble_steps.py index 5d9c098a..7113dfde 100644 --- a/bigml/tests/create_ensemble_steps.py +++ b/bigml/tests/create_ensemble_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member,broad-except # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_evaluation_steps.py b/bigml/tests/create_evaluation_steps.py index 0a9756b5..c7412a38 100644 --- a/bigml/tests/create_evaluation_steps.py +++ b/bigml/tests/create_evaluation_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2012, 2015-2023 BigML +# Copyright 2012, 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
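Note: the check_annotations step added above totals the annotations reported in a dataset field summary, using the precomputed regions sum for regions fields and the per-category counts otherwise. A minimal standalone sketch of that counting logic, with hypothetical field summaries in place of a live world.dataset::

    def count_annotations(field):
        """Totals the annotations reported in a dataset field summary."""
        if field["optype"] == "regions":
            # regions fields carry a precomputed sum of annotated regions
            return field["summary"]["regions"]["sum"]
        # categorical fields list [category, count] pairs
        return sum(num for _, num in field["summary"]["categories"])

    # hypothetical summaries, just to exercise both branches
    regions_field = {"optype": "regions",
                     "summary": {"regions": {"sum": 12}}}
    label_field = {"optype": "categorical",
                   "summary": {"categories": [["cat1", 2], ["cat2", 1]]}}
    assert count_annotations(regions_field) == 12
    assert count_annotations(label_field) == 3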
You may obtain diff --git a/bigml/tests/create_execution_steps.py b/bigml/tests/create_execution_steps.py index de478629..6d4d69a6 100644 --- a/bigml/tests/create_execution_steps.py +++ b/bigml/tests/create_execution_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_external_steps.py b/bigml/tests/create_external_steps.py index 06a48425..08bb6f22 100644 --- a/bigml/tests/create_external_steps.py +++ b/bigml/tests/create_external_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2020-2023 BigML +# Copyright 2020-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_forecast_steps.py b/bigml/tests/create_forecast_steps.py index e6999d3a..15a922b8 100644 --- a/bigml/tests/create_forecast_steps.py +++ b/bigml/tests/create_forecast_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_lda_steps.py b/bigml/tests/create_lda_steps.py index 597eab4f..cd06ac96 100644 --- a/bigml/tests/create_lda_steps.py +++ b/bigml/tests/create_lda_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_library_steps.py b/bigml/tests/create_library_steps.py index 77a37aca..dd8cb5d2 100644 --- a/bigml/tests/create_library_steps.py +++ b/bigml/tests/create_library_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_linear_steps.py b/bigml/tests/create_linear_steps.py index b62c41d7..88fae1b9 100644 --- a/bigml/tests/create_linear_steps.py +++ b/bigml/tests/create_linear_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2019-2023 BigML +# Copyright 2019-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/tests/create_model_steps.py b/bigml/tests/create_model_steps.py index 359ade36..811daf30 100644 --- a/bigml/tests/create_model_steps.py +++ b/bigml/tests/create_model_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -31,6 +31,7 @@ from bigml.deepnet import Deepnet from bigml.fusion import Fusion from bigml.ensemble import Ensemble +from bigml.generators.model import get_leaves from .read_resource_steps import wait_until_status_code_is @@ -690,3 +691,8 @@ def the_cloned_logistic_regression_is(step, logistic_regression): def check_deepnet_id_local_id(step): """Checking that deepnet ID and local deepnet ID match""" eq_(world.deepnet["resource"], step.bigml["local_deepnet"].resource_id) + + +def check_leaves_number(step, leaves_number): + """Checking the number of leaves in a tree local model""" + eq_(len(get_leaves(step.bigml["local_model"])), leaves_number) diff --git a/bigml/tests/create_multimodel_steps.py b/bigml/tests/create_multimodel_steps.py index ffc5fbf8..7fe82a82 100644 --- a/bigml/tests/create_multimodel_steps.py +++ b/bigml/tests/create_multimodel_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_pca_steps.py b/bigml/tests/create_pca_steps.py index 0772fb11..c5a8ff09 100644 --- a/bigml/tests/create_pca_steps.py +++ b/bigml/tests/create_pca_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_prediction_steps.py b/bigml/tests/create_prediction_steps.py index 3432c512..978d577c 100644 --- a/bigml/tests/create_prediction_steps.py +++ b/bigml/tests/create_prediction_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_project_steps.py b/bigml/tests/create_project_steps.py index f28ee969..3d997bfe 100644 --- a/bigml/tests/create_project_steps.py +++ b/bigml/tests/create_project_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
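Note: the new check_leaves_number step relies on the get_leaves generator imported above. A short sketch of the same check outside the test harness; the ./tmp/model.json path is an assumption standing in for a decision tree previously exported to a local file::

    from bigml.model import Model
    from bigml.generators.model import get_leaves

    # assumption: ./tmp/model.json is a decision tree exported beforehand,
    # as in the test_40_local_from_file scenarios
    local_model = Model("./tmp/model.json")
    print("leaves:", len(get_leaves(local_model)))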
You may obtain diff --git a/bigml/tests/create_projection_steps.py b/bigml/tests/create_projection_steps.py index 3d9be145..92df6cb7 100644 --- a/bigml/tests/create_projection_steps.py +++ b/bigml/tests/create_projection_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_sample_steps.py b/bigml/tests/create_sample_steps.py index 2a9029be..8f451f4b 100644 --- a/bigml/tests/create_sample_steps.py +++ b/bigml/tests/create_sample_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_script_steps.py b/bigml/tests/create_script_steps.py index 3cebabab..cb7ab4ed 100644 --- a/bigml/tests/create_script_steps.py +++ b/bigml/tests/create_script_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_source_steps.py b/bigml/tests/create_source_steps.py index 7ff1b3c7..3eac296a 100644 --- a/bigml/tests/create_source_steps.py +++ b/bigml/tests/create_source_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2012, 2015-2023 BigML +# Copyright 2012, 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_statistical_tst_steps.py b/bigml/tests/create_statistical_tst_steps.py index 88774bc0..44e76dd4 100644 --- a/bigml/tests/create_statistical_tst_steps.py +++ b/bigml/tests/create_statistical_tst_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/create_time_series_steps.py b/bigml/tests/create_time_series_steps.py index 06e9fc82..d12fc2c8 100644 --- a/bigml/tests/create_time_series_steps.py +++ b/bigml/tests/create_time_series_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/tests/delete_project_steps.py b/bigml/tests/delete_project_steps.py index 83be7f13..49d6ddb6 100644 --- a/bigml/tests/delete_project_steps.py +++ b/bigml/tests/delete_project_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member,broad-except # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/fields_steps.py b/bigml/tests/fields_steps.py index 4df3e12e..59336ea5 100644 --- a/bigml/tests/fields_steps.py +++ b/bigml/tests/fields_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/inspect_model_steps.py b/bigml/tests/inspect_model_steps.py index 8b2bd637..a13c90ac 100644 --- a/bigml/tests/inspect_model_steps.py +++ b/bigml/tests/inspect_model_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,unused-argument,no-member # -# Copyright 2012, 2015-2023 BigML +# Copyright 2012, 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/read_dataset_steps.py b/bigml/tests/read_dataset_steps.py index b1f37e89..026b361c 100644 --- a/bigml/tests/read_dataset_steps.py +++ b/bigml/tests/read_dataset_steps.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=locally-disabled,no-member # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/read_resource_steps.py b/bigml/tests/read_resource_steps.py index d5f5070d..bf702e04 100644 --- a/bigml/tests/read_resource_steps.py +++ b/bigml/tests/read_resource_steps.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2014-2023 BigML +# Copyright 2014-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -14,6 +14,7 @@ # License for the specific language governing permissions and limitations # under the License. +import time from datetime import datetime @@ -46,6 +47,7 @@ def wait_until_status_code_is(code1, code2, secs, resource_info): if status['code'] == int(code2): world.errors.append(resource_info) eq_(status['code'], int(code1)) + time.sleep(0.1) # added to avoid synch mongo issues return i_get_the_resource(resource_info) diff --git a/bigml/tests/test_01_prediction.py b/bigml/tests/test_01_prediction.py index 9d416e91..7a97fd6d 100644 --- a/bigml/tests/test_01_prediction.py +++ b/bigml/tests/test_01_prediction.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
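Note: the extra time.sleep(0.1) added to wait_until_status_code_is pauses between the status check and the next resource read to avoid transient synchronization issues on the backend. A generic sketch of that polling pattern; the helper and its defaults are illustrative only, not part of the bindings::

    import time

    def wait_until_finished(get_status, finished_code, timeout=60, pause=0.1):
        """Polls get_status() until it returns finished_code or times out."""
        deadline = time.time() + timeout
        while time.time() < deadline:
            if get_status() == finished_code:
                return True
            time.sleep(pause)  # short pause, mirroring the added sleep(0.1)
        return False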
You may obtain diff --git a/bigml/tests/test_03_local_prediction.py b/bigml/tests/test_03_local_prediction.py index 04cbb06a..e746accd 100644 --- a/bigml/tests/test_03_local_prediction.py +++ b/bigml/tests/test_03_local_prediction.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_04_multivote_prediction.py b/bigml/tests/test_04_multivote_prediction.py index 2ec448dd..b66f5abd 100644 --- a/bigml/tests/test_04_multivote_prediction.py +++ b/bigml/tests/test_04_multivote_prediction.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_05_compare_predictions.py b/bigml/tests/test_05_compare_predictions.py index c9a8f1df..7cebde55 100644 --- a/bigml/tests/test_05_compare_predictions.py +++ b/bigml/tests/test_05_compare_predictions.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_05_compare_predictions_b.py b/bigml/tests/test_05_compare_predictions_b.py index 379a16fc..65097657 100644 --- a/bigml/tests/test_05_compare_predictions_b.py +++ b/bigml/tests/test_05_compare_predictions_b.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_06_batch_predictions.py b/bigml/tests/test_06_batch_predictions.py index 1716589c..89266f8b 100644 --- a/bigml/tests/test_06_batch_predictions.py +++ b/bigml/tests/test_06_batch_predictions.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_07_multimodel_batch_predictions.py b/bigml/tests/test_07_multimodel_batch_predictions.py index 0a0838e6..a19ea4ca 100644 --- a/bigml/tests/test_07_multimodel_batch_predictions.py +++ b/bigml/tests/test_07_multimodel_batch_predictions.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/tests/test_08_multimodel.py b/bigml/tests/test_08_multimodel.py index 78b3e30a..c9ac4d1b 100644 --- a/bigml/tests/test_08_multimodel.py +++ b/bigml/tests/test_08_multimodel.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_09_ensemble_prediction.py b/bigml/tests/test_09_ensemble_prediction.py index ec11cb1e..52b06872 100644 --- a/bigml/tests/test_09_ensemble_prediction.py +++ b/bigml/tests/test_09_ensemble_prediction.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_10_local_ensemble_prediction.py b/bigml/tests/test_10_local_ensemble_prediction.py index 997ff0db..2e35f1b0 100644 --- a/bigml/tests/test_10_local_ensemble_prediction.py +++ b/bigml/tests/test_10_local_ensemble_prediction.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_11_multimodel_prediction.py b/bigml/tests/test_11_multimodel_prediction.py index b8c2699b..23021c1d 100644 --- a/bigml/tests/test_11_multimodel_prediction.py +++ b/bigml/tests/test_11_multimodel_prediction.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_12_public_model_prediction.py b/bigml/tests/test_12_public_model_prediction.py index 8d52d974..cbfe2e36 100644 --- a/bigml/tests/test_12_public_model_prediction.py +++ b/bigml/tests/test_12_public_model_prediction.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_13_public_dataset.py b/bigml/tests/test_13_public_dataset.py index 145dcaf7..94657661 100644 --- a/bigml/tests/test_13_public_dataset.py +++ b/bigml/tests/test_13_public_dataset.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/tests/test_14_create_evaluations.py b/bigml/tests/test_14_create_evaluations.py index 316a9460..093dc638 100644 --- a/bigml/tests/test_14_create_evaluations.py +++ b/bigml/tests/test_14_create_evaluations.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -194,7 +194,7 @@ def test_scenario4(self): "evaluation_wait", "metric", "value"] examples = [ ['data/iris.csv', '50', '50', '800', '80', 'average_phi', - '0.97007']] + '0.98029']] for example in examples: example = dict(zip(headers, example)) show_method(self, self.bigml["method"], example) diff --git a/bigml/tests/test_15_download.py b/bigml/tests/test_15_download.py index bfba8f70..415257e2 100644 --- a/bigml/tests/test_15_download.py +++ b/bigml/tests/test_15_download.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_16_sample_dataset.py b/bigml/tests/test_16_sample_dataset.py index 46c6280c..186b76ef 100644 --- a/bigml/tests/test_16_sample_dataset.py +++ b/bigml/tests/test_16_sample_dataset.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_17_split_dataset.py b/bigml/tests/test_17_split_dataset.py index 73056e4f..c570ea12 100644 --- a/bigml/tests/test_17_split_dataset.py +++ b/bigml/tests/test_17_split_dataset.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_18_create_anomaly.py b/bigml/tests/test_18_create_anomaly.py index 33e4098e..b38adfa6 100644 --- a/bigml/tests/test_18_create_anomaly.py +++ b/bigml/tests/test_18_create_anomaly.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/tests/test_19_missing_and_errors.py b/bigml/tests/test_19_missing_and_errors.py index d8c41968..22326c08 100644 --- a/bigml/tests/test_19_missing_and_errors.py +++ b/bigml/tests/test_19_missing_and_errors.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_20_rename_duplicated_names.py b/bigml/tests/test_20_rename_duplicated_names.py index dadf9d30..ac2def75 100644 --- a/bigml/tests/test_20_rename_duplicated_names.py +++ b/bigml/tests/test_20_rename_duplicated_names.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_21_projects.py b/bigml/tests/test_21_projects.py index 2c61acc1..b58f6d0a 100644 --- a/bigml/tests/test_21_projects.py +++ b/bigml/tests/test_21_projects.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_22_source_args.py b/bigml/tests/test_22_source_args.py index 7c2b6e3a..b66edc9e 100644 --- a/bigml/tests/test_22_source_args.py +++ b/bigml/tests/test_22_source_args.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import,no-member # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -25,6 +25,7 @@ from .world import world, setup_module, teardown_module, show_doc, \ show_method from . import create_source_steps as source_create +from . 
import create_dataset_steps as dataset_create class TestUploadSource: @@ -125,3 +126,40 @@ def test_scenario3(self): source_create.the_source_is_finished( self, example["source_wait"]) source_create.the_cloned_source_origin_is(self, source) + + def test_scenario4(self): + """ + Scenario: Successfully adding annotatations to composite source: + Given I create an annotated images data source uploading a "" file + And I wait until the source is ready less than secs + And I create a dataset + And I wait until the dataset is ready less than secs + Then the new dataset has annotations in the field + """ + headers = ["data", "source_wait", "dataset_wait", "annotations_num", + "annotations_field"] + examples = [ + ['data/images/metadata.json', '500', '500', '12', + '100002'], + ['data/images/metadata_compact.json', '500', '500', '3', + '100003'], + ['data/images/metadata_list.json', '500', '500', '3', + '100003']] + show_doc(self.test_scenario4) + for example in examples: + example = dict(zip(headers, example)) + show_method(self, self.bigml["method"], example) + source_create.i_create_annotated_source( + self, + example["data"], + args={"image_analysis": {"enabled": False, + "extracted_features": []}}) + source_create.the_source_is_finished( + self, example["source_wait"]) + dataset_create.i_create_a_dataset(self) + dataset_create.the_dataset_is_finished_in_less_than( + self, example["dataset_wait"]) + dataset_create.check_annotations(self, + example["annotations_field"], + example["annotations_num"]) + diff --git a/bigml/tests/test_23_local_model_info.py b/bigml/tests/test_23_local_model_info.py index 18e82a48..8ee0ac97 100644 --- a/bigml/tests/test_23_local_model_info.py +++ b/bigml/tests/test_23_local_model_info.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_24_cluster_derived.py b/bigml/tests/test_24_cluster_derived.py index e2e7d588..5e565463 100644 --- a/bigml/tests/test_24_cluster_derived.py +++ b/bigml/tests/test_24_cluster_derived.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_25_correlation.py b/bigml/tests/test_25_correlation.py index 5812bf32..27f4c029 100644 --- a/bigml/tests/test_25_correlation.py +++ b/bigml/tests/test_25_correlation.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
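Note: the new test_scenario4 drives the annotated-composite workflow end to end. A hedged sketch of the equivalent direct API calls, assuming the bindings expose a create_annotated_source helper taking the metadata file and an args dict (mirroring the i_create_annotated_source step), and that BIGML_USERNAME and BIGML_API_KEY are set::

    from bigml.api import BigML

    api = BigML()  # credentials read from the environment
    source = api.create_annotated_source(
        "data/images/metadata_compact.json",
        args={"image_analysis": {"enabled": False,
                                 "extracted_features": []}})
    api.ok(source)
    dataset = api.create_dataset(source)
    api.ok(dataset)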
You may obtain diff --git a/bigml/tests/test_26_statistical_test.py b/bigml/tests/test_26_statistical_test.py index 332e9988..b09ebd48 100644 --- a/bigml/tests/test_26_statistical_test.py +++ b/bigml/tests/test_26_statistical_test.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_27_fields.py b/bigml/tests/test_27_fields.py index a6b0892f..bd461f04 100644 --- a/bigml/tests/test_27_fields.py +++ b/bigml/tests/test_27_fields.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_28_association.py b/bigml/tests/test_28_association.py index 60b3015f..7e5bec63 100644 --- a/bigml/tests/test_28_association.py +++ b/bigml/tests/test_28_association.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_29_script.py b/bigml/tests/test_29_script.py index 0bd8e7bc..eb5bc752 100644 --- a/bigml/tests/test_29_script.py +++ b/bigml/tests/test_29_script.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_30_execution.py b/bigml/tests/test_30_execution.py index 17d8d25b..e1864d5c 100644 --- a/bigml/tests/test_30_execution.py +++ b/bigml/tests/test_30_execution.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_31_library.py b/bigml/tests/test_31_library.py index be877fe0..9de406c8 100644 --- a/bigml/tests/test_31_library.py +++ b/bigml/tests/test_31_library.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/tests/test_32_topic_model_prediction.py b/bigml/tests/test_32_topic_model_prediction.py index f1899f09..fd26e407 100644 --- a/bigml/tests/test_32_topic_model_prediction.py +++ b/bigml/tests/test_32_topic_model_prediction.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2016-2023 BigML +# Copyright 2016-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_33_compare_predictions.py b/bigml/tests/test_33_compare_predictions.py index 6dc0bf72..cf322c36 100644 --- a/bigml/tests/test_33_compare_predictions.py +++ b/bigml/tests/test_33_compare_predictions.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_34_time_series.py b/bigml/tests/test_34_time_series.py index ebf997af..4b5fb472 100644 --- a/bigml/tests/test_34_time_series.py +++ b/bigml/tests/test_34_time_series.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_35_b_compare_predictions.py b/bigml/tests/test_35_b_compare_predictions.py index 0002b9f9..7b768ff6 100644 --- a/bigml/tests/test_35_b_compare_predictions.py +++ b/bigml/tests/test_35_b_compare_predictions.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_35_c_compare_predictions.py b/bigml/tests/test_35_c_compare_predictions.py index e45e5b87..0a39e66d 100644 --- a/bigml/tests/test_35_c_compare_predictions.py +++ b/bigml/tests/test_35_c_compare_predictions.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_35_compare_predictions.py b/bigml/tests/test_35_compare_predictions.py index 16c54176..248b9520 100644 --- a/bigml/tests/test_35_compare_predictions.py +++ b/bigml/tests/test_35_compare_predictions.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/tests/test_35_d_compare_predictions.py b/bigml/tests/test_35_d_compare_predictions.py index 023d3830..442ac2cf 100644 --- a/bigml/tests/test_35_d_compare_predictions.py +++ b/bigml/tests/test_35_d_compare_predictions.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_35_e_compare_predictions.py b/bigml/tests/test_35_e_compare_predictions.py index bd408698..b998b1a4 100644 --- a/bigml/tests/test_35_e_compare_predictions.py +++ b/bigml/tests/test_35_e_compare_predictions.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_36_compare_predictions.py b/bigml/tests/test_36_compare_predictions.py index ae5c5954..c8a76e3d 100644 --- a/bigml/tests/test_36_compare_predictions.py +++ b/bigml/tests/test_36_compare_predictions.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -77,7 +77,7 @@ def test_scenario1(self): 'Iris-versicolor', '{}'], ['data/iris_missing2.csv', '30', '50', '60', '{}', '000004', 'Iris-versicolor', '{}'], - ['data/grades.csv', '30', '50', '60', '{}', '000005', 55.6560, + ['data/grades.csv', '30', '50', '60', '{}', '000005', 47.04852, '{}'], ['data/spam.csv', '30', '50', '60', '{}', '000000', 'ham', '{}']] show_doc(self.test_scenario1) diff --git a/bigml/tests/test_37_configuration.py b/bigml/tests/test_37_configuration.py index fecd0da3..1c4ba9ac 100644 --- a/bigml/tests/test_37_configuration.py +++ b/bigml/tests/test_37_configuration.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_38_organization.py b/bigml/tests/test_38_organization.py index 2d699fc8..4187a474 100644 --- a/bigml/tests/test_38_organization.py +++ b/bigml/tests/test_38_organization.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/tests/test_38_project_connection.py b/bigml/tests/test_38_project_connection.py index 38a9aeba..7175d8a6 100644 --- a/bigml/tests/test_38_project_connection.py +++ b/bigml/tests/test_38_project_connection.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import,broad-except # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_39_optiml_fusion.py b/bigml/tests/test_39_optiml_fusion.py index 4273e371..0ff5992f 100644 --- a/bigml/tests/test_39_optiml_fusion.py +++ b/bigml/tests/test_39_optiml_fusion.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_40_local_from_file.py b/bigml/tests/test_40_local_from_file.py index eba94bec..c8311285 100644 --- a/bigml/tests/test_40_local_from_file.py +++ b/bigml/tests/test_40_local_from_file.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -66,17 +66,18 @@ def test_scenario1(self): When I create a local model from the file "" Then the model ID and the local model ID match And the prediction for "" is "" + And the number of leaves is "" """ show_doc(self.test_scenario1) headers = ["data", "source_wait", "dataset_wait", "model_wait", "pmml", "exported_file", "input_data", "prediction", - "model_conf"] + "model_conf", 'leaves#'] examples = [ ['data/iris.csv', '10', '10', '10', False, - './tmp/model.json', {}, "Iris-setosa", '{}'], + './tmp/model.json', {}, "Iris-setosa", '{}', 9], ['data/iris.csv', '10', '10', '10', False, './tmp/model_dft.json', {}, "Iris-versicolor", - '{"default_numeric_value": "mean"}']] + '{"default_numeric_value": "mean"}', 9]] for example in examples: example = dict(zip(headers, example)) show_method(self, self.bigml["method"], example) @@ -97,6 +98,7 @@ def test_scenario1(self): model_create.check_model_id_local_id(self) model_create.local_model_prediction_is( self, example["input_data"], example["prediction"]) + model_create.check_leaves_number(self, example["leaves#"]) def test_scenario2(self): """ @@ -211,7 +213,7 @@ def test_scenario4(self): ['data/iris.csv', '10', '10', '500', './tmp/deepnet.json', {}, 'Iris-versicolor', '{}'], ['data/iris.csv', '10', '10', '500', './tmp/deepnet_dft.json', {}, - 'Iris-virginica', '{"default_numeric_value": "maximum"}']] + 'Iris-versicolor', '{"default_numeric_value": "maximum"}']] for example in examples: example = dict(zip(headers, example)) show_method(self, self.bigml["method"], example) diff --git a/bigml/tests/test_41_multidataset.py b/bigml/tests/test_41_multidataset.py index ad966e79..e0c8f1b3 100644 --- a/bigml/tests/test_41_multidataset.py +++ b/bigml/tests/test_41_multidataset.py @@ -2,7 +2,7 @@ #pylint: 
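Note: test_40_local_from_file now also checks the leaf count of models rebuilt from exported files. A sketch of that local-from-file flow with a hypothetical model id; api.export and the local Model class are assumed to behave as described in the bindings documentation::

    from bigml.api import BigML
    from bigml.model import Model

    api = BigML()
    # hypothetical model id; export writes the model JSON to a local file
    api.export("model/5f3c1a2b4e1a2b3c4d5e6f70",
               filename="./tmp/model.json")
    local_model = Model("./tmp/model.json")
    print(local_model.predict({"petal length": 2.5}))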
disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_42_pca.py b/bigml/tests/test_42_pca.py index ef67391d..706305bf 100644 --- a/bigml/tests/test_42_pca.py +++ b/bigml/tests/test_42_pca.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_43_linear.py b/bigml/tests/test_43_linear.py index 6b8a3a65..a9a20ecb 100644 --- a/bigml/tests/test_43_linear.py +++ b/bigml/tests/test_43_linear.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2019-2023 BigML +# Copyright 2019-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_44_compare_predictions.py b/bigml/tests/test_44_compare_predictions.py index 44e4868e..c50a6350 100644 --- a/bigml/tests/test_44_compare_predictions.py +++ b/bigml/tests/test_44_compare_predictions.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_45_external_connector.py b/bigml/tests/test_45_external_connector.py index 28162ef7..deac2c94 100644 --- a/bigml/tests/test_45_external_connector.py +++ b/bigml/tests/test_45_external_connector.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_47_webhooks.py b/bigml/tests/test_47_webhooks.py index 09e34fc0..3206f0ef 100644 --- a/bigml/tests/test_47_webhooks.py +++ b/bigml/tests/test_47_webhooks.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import,invalid-name # -# Copyright 2022-2023 BigML +# Copyright 2022-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/tests/test_48_local_dataset.py b/bigml/tests/test_48_local_dataset.py index a57787d4..eabd52f1 100644 --- a/bigml/tests/test_48_local_dataset.py +++ b/bigml/tests/test_48_local_dataset.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2022-2023 BigML +# Copyright 2022-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tests/test_49_local_pipeline.py b/bigml/tests/test_49_local_pipeline.py index 204c11f8..651a87a3 100644 --- a/bigml/tests/test_49_local_pipeline.py +++ b/bigml/tests/test_49_local_pipeline.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import # -# Copyright 2022-2023 BigML +# Copyright 2022-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -210,28 +210,28 @@ def test_scenario4(self): examples = [ ['data/dates2.csv', '20', '45', '160', '{"time-1": "1910-05-08T19:10:23.106", "cat-0":"cat2"}', - '000002', -0.02616, "pipeline1"], + '000002', -0.4264, "pipeline1"], ['data/dates2.csv', '20', '45', '160', '{"time-1": "2011-04-01T00:16:45.747", "cat-0":"cat2"}', - '000002', 0.13352, "pipeline2"], + '000002', 0.11985, "pipeline2"], ['data/dates2.csv', '20', '45', '160', '{"time-1": "1969-W29-1T17:36:39Z", "cat-0":"cat1"}', - '000002', 0.10071, "pipeline3"], + '000002', -0.08211, "pipeline3"], ['data/dates2.csv', '20', '45', '160', '{"time-1": "1920-06-45T20:21:20.320", "cat-0":"cat1"}', - '000002', 0.10071, "pipeline4"], + '000002', -0.08211, "pipeline4"], ['data/dates2.csv', '20', '45', '160', '{"time-1": "2001-01-05T23:04:04.693", "cat-0":"cat2"}', - '000002', 0.15235, "pipeline5"], + '000002', 0.00388, "pipeline5"], ['data/dates2.csv', '20', '45', '160', '{"time-1": "1950-11-06T05:34:05.602", "cat-0":"cat1"}', - '000002', -0.07686, "pipeline6"], + '000002', -0.04976, "pipeline6"], ['data/dates2.csv', '20', '45', '160', '{"time-1": "1932-01-30T19:24:11.440", "cat-0":"cat2"}', - '000002', 0.0017, "pipeline7"], + '000002', -0.36264, "pipeline7"], ['data/dates2.csv', '20', '45', '160', '{"time-1": "Mon Jul 14 17:36 +0000 1969", "cat-0":"cat1"}', - '000002', 0.10071, "pipeline8"]] + '000002', -0.08211, "pipeline8"]] show_doc(self.test_scenario4) for example in examples: example = dict(zip(headers, example)) diff --git a/bigml/tests/test_99_cleaning.py b/bigml/tests/test_99_cleaning.py index 9339ec9a..1f80e98e 100644 --- a/bigml/tests/test_99_cleaning.py +++ b/bigml/tests/test_99_cleaning.py @@ -2,7 +2,7 @@ #pylint: disable=locally-disabled,line-too-long,attribute-defined-outside-init #pylint: disable=locally-disabled,unused-import,no-self-use # -# Copyright 2018-2023 BigML +# Copyright 2018-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain diff --git a/bigml/tests/world.py b/bigml/tests/world.py index df52b101..f3c86ba2 100644 --- a/bigml/tests/world.py +++ b/bigml/tests/world.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2015-2023 BigML +# Copyright 2015-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/timeseries.py b/bigml/timeseries.py index d90b1edb..62c6b2f5 100644 --- a/bigml/timeseries.py +++ b/bigml/timeseries.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/topicmodel.py b/bigml/topicmodel.py index 8c2e56a7..abc87b5f 100644 --- a/bigml/topicmodel.py +++ b/bigml/topicmodel.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2016-2023 BigML +# Copyright 2016-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tree_utils.py b/bigml/tree_utils.py index 110957a6..ed033dbf 100644 --- a/bigml/tree_utils.py +++ b/bigml/tree_utils.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tsoutconstants.py b/bigml/tsoutconstants.py index 3f20f473..7903a6f6 100644 --- a/bigml/tsoutconstants.py +++ b/bigml/tsoutconstants.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/tssubmodels.py b/bigml/tssubmodels.py index 213f25ba..1e055af8 100644 --- a/bigml/tssubmodels.py +++ b/bigml/tssubmodels.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- #pylint: disable=invalid-name # -# Copyright 2017-2023 BigML +# Copyright 2017-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/bigml/util.py b/bigml/util.py index 25ccadba..df6b5d67 100644 --- a/bigml/util.py +++ b/bigml/util.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2012-2023 BigML +# Copyright 2012-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain @@ -28,6 +28,7 @@ import random import ast import datetime +import logging from urllib.parse import urlparse from unidecode import unidecode @@ -748,6 +749,7 @@ def get_formatted_data(input_data_list, out_format=None): inner_data_list = input_data_list.copy() return inner_data_list + #pylint: disable=locally-disabled,import-outside-toplevel def get_data_transformations(resource_id, parent_id): """Returns the pipeline that contains the tranformations and derived @@ -760,3 +762,13 @@ def get_data_transformations(resource_id, parent_id): "pipeline.") from bigml.pipeline.pipeline import BMLPipeline return BMLPipeline("dt-%s" % resource_id, [parent_id]) + + +def sensenet_logging(): + """Removes warnings unnecessary logging when using sensenet""" + logging.disable(logging.WARNING) + os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + os.environ["TF_USE_LEGACY_KERAS"] = "1" + import tensorflow as tf + tf.autograph.set_verbosity(0) + logging.getLogger("tensorflow").setLevel(logging.ERROR) diff --git a/bigml/version.py b/bigml/version.py index 9865fb2a..68512901 100644 --- a/bigml/version.py +++ b/bigml/version.py @@ -1 +1 @@ -__version__ = '9.8.0.dev1' +__version__ = '9.8.3' diff --git a/bigml/webhooks.py b/bigml/webhooks.py index a5db5abe..a1f762e5 100644 --- a/bigml/webhooks.py +++ b/bigml/webhooks.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # -# Copyright 2022-2023 BigML +# Copyright 2022-2025 BigML # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain diff --git a/data/images/annotations_compact.json b/data/images/annotations_compact.json new file mode 100644 index 00000000..294de440 --- /dev/null +++ b/data/images/annotations_compact.json @@ -0,0 +1,2 @@ +[{"file": "f1/fruits1f.png", "my_regions": "[[\"region1\" 0.2 0.2 0.4 0.4]]"}, + {"file": "f1/fruits1.png", "my_regions": "[[\"region2\" 0.3 0.3 0.5 0.5] [\"region1\" 0.6 0.6 0.8 0.8]]"}] diff --git a/data/images/annotations_list.json b/data/images/annotations_list.json new file mode 100644 index 00000000..ecfee3db --- /dev/null +++ b/data/images/annotations_list.json @@ -0,0 +1,2 @@ +[{"file": "f1/fruits1f.png", "my_regions": [{"label": "region1", "xmin": 0.2, "ymin": 0.2, "xmax": 0.4, "ymax": 0.4}]}, + {"file": "f1/fruits1.png", "my_regions": [{"label": "region2", "xmin": 0.2, "ymin": 0.2, "xmax": 0.4, "ymax": 0.4}, {"label": "region1", "xmin": 0.5, "ymin": 0.5, "xmax": 0.7, "ymax": 0.7}]}] diff --git a/data/images/metadata_compact.json b/data/images/metadata_compact.json new file mode 100644 index 00000000..45db412f --- /dev/null +++ b/data/images/metadata_compact.json @@ -0,0 +1,5 @@ +{"description": "Fruit images to test colour distributions with regions", + "images_file": "./fruits_hist.zip", + "new_fields": [{"name": "my_regions", "optype": "regions"}], + "source_id": null, + "annotations": "./annotations_compact.json"} diff --git a/data/images/metadata_list.json b/data/images/metadata_list.json new file mode 100644 index 00000000..1bf61c67 --- /dev/null +++ b/data/images/metadata_list.json @@ -0,0 +1,5 @@ +{"description": "Fruit images to test colour distributions with regions", + "images_file": "./fruits_hist.zip", + "new_fields": [{"name": "my_regions", "optype": "regions"}], + "source_id": null, + "annotations": "./annotations_list.json"} diff --git a/docs/101_anomaly.rst b/docs/101_anomaly.rst index 3ad31416..03fc9c31 100644 --- a/docs/101_anomaly.rst +++ b/docs/101_anomaly.rst @@ -1,8 +1,8 @@ .. 
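Note: the new annotations_compact.json and annotations_list.json fixtures describe the same regions in two layouts, a compact string of ["label" xmin ymin xmax ymax] groups and a list of dictionaries. A small sketch converting the list layout into the compact one; the helper name is ours, the two layouts come from the fixtures above::

    def to_compact(regions):
        """Renders a list of region dicts as the compact string layout."""
        parts = ['["%s" %g %g %g %g]' % (r["label"], r["xmin"], r["ymin"],
                                         r["xmax"], r["ymax"])
                 for r in regions]
        return "[%s]" % " ".join(parts)

    regions = [{"label": "region1", "xmin": 0.2, "ymin": 0.2,
                "xmax": 0.4, "ymax": 0.4}]
    print(to_compact(regions))  # [["region1" 0.2 0.2 0.4 0.4]]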
toctree:: :hidden: -BigML Bindings: 101 - Using an anomaly detector -=============================================== +101 - Anomaly detector usage +============================ Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_association.rst b/docs/101_association.rst index e7cdb7e4..371456a2 100644 --- a/docs/101_association.rst +++ b/docs/101_association.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Using Association Discovery -================================================= +101 - Association Discovery usage +================================= Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_cluster.rst b/docs/101_cluster.rst index a7da2d2e..d4998463 100644 --- a/docs/101_cluster.rst +++ b/docs/101_cluster.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Using a Cluster -===================================== +101 - Cluster Usage +=================== Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_deepnet.rst b/docs/101_deepnet.rst index 8fe4330d..c8f1d2c6 100644 --- a/docs/101_deepnet.rst +++ b/docs/101_deepnet.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Using a Deepnet Model -=========================================== +101 - Deepnet usage +=================== Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_ensemble.rst b/docs/101_ensemble.rst index d996bbd8..0ca3f747 100644 --- a/docs/101_ensemble.rst +++ b/docs/101_ensemble.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Using an Ensemble -======================================= +101 - Ensemble usage +==================== Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_fusion.rst b/docs/101_fusion.rst index 01352f6c..8b549759 100644 --- a/docs/101_fusion.rst +++ b/docs/101_fusion.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Using a Fusion Model -========================================== +101 - Fusion usage +================== Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_images_classification.rst b/docs/101_images_classification.rst index 22d268d1..dd6fc4eb 100644 --- a/docs/101_images_classification.rst +++ b/docs/101_images_classification.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Images Classification -=========================================== +101 - Images Classification +=========================== Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_images_feature_extraction.rst b/docs/101_images_feature_extraction.rst index 5d838e08..f649d650 100644 --- a/docs/101_images_feature_extraction.rst +++ b/docs/101_images_feature_extraction.rst @@ -1,8 +1,8 @@ .. 
toctree:: :hidden: -BigML Bindings: 101 - Images Feature Extraction -=============================================== +101 - Images Feature Extraction +=============================== Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_linear_regression.rst b/docs/101_linear_regression.rst index b610b6ea..08f87889 100644 --- a/docs/101_linear_regression.rst +++ b/docs/101_linear_regression.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Using a Linear Regression -================================================= +101 - Linear Regression usage +============================= Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_logistic_regression.rst b/docs/101_logistic_regression.rst index ed7fdec0..8cda0471 100644 --- a/docs/101_logistic_regression.rst +++ b/docs/101_logistic_regression.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Using a Logistic Regression -================================================= +101 - Logistic Regression usage +=============================== Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_model.rst b/docs/101_model.rst index 294d439a..a7bf1915 100644 --- a/docs/101_model.rst +++ b/docs/101_model.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Using a Decision Tree Model -================================================= +101 - Decision Tree usage +========================= Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_object_detection.rst b/docs/101_object_detection.rst index 0d98a0a9..b851366d 100644 --- a/docs/101_object_detection.rst +++ b/docs/101_object_detection.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Images Object Detection -============================================= +101 - Images Object Detection +============================= Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_optiml.rst b/docs/101_optiml.rst index ad96b959..cd1f7d2e 100644 --- a/docs/101_optiml.rst +++ b/docs/101_optiml.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Using an OptiML -===================================== +101 - OptiML usage +================== Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_pca.rst b/docs/101_pca.rst index 9e8ba268..2138470a 100644 --- a/docs/101_pca.rst +++ b/docs/101_pca.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Using a PCA -================================= +101 - PCA usage +=============== The PCA model is used to find the linear combination of your original features that best describes your data. In that sense, the goal of the model diff --git a/docs/101_scripting.rst b/docs/101_scripting.rst index 5e17ebec..aa0f05a2 100644 --- a/docs/101_scripting.rst +++ b/docs/101_scripting.rst @@ -1,8 +1,8 @@ .. 
toctree:: :hidden: -BigML Bindings: 101 - Creating and executing scripts -==================================================== +101 - Creating and executing scripts +==================================== The bindings offer methods to create and execute `WhizzML `_ scripts in the platform. diff --git a/docs/101_topic_model.rst b/docs/101_topic_model.rst index acabd178..065dcd2e 100644 --- a/docs/101_topic_model.rst +++ b/docs/101_topic_model.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Using a Topic Model -========================================= +101 - Topic Model usage +======================= Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/101_ts.rst b/docs/101_ts.rst index 9d349717..ff5388b0 100644 --- a/docs/101_ts.rst +++ b/docs/101_ts.rst @@ -1,8 +1,8 @@ .. toctree:: :hidden: -BigML Bindings: 101 - Using a Time Series -========================================= +101 - Time Series usage +======================= Following the schema described in the `prediction workflow `_, document, this is the code snippet that shows the minimal workflow to diff --git a/docs/conf.py b/docs/conf.py index 5815a33f..ac951f6b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -30,7 +30,9 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = [] +extensions = [ + 'sphinx_rtd_theme' +] # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] @@ -46,7 +48,7 @@ # General information about the project. project = u'BigML' -copyright = u'2011 - 2020, The BigML Team' +copyright = u'2011 - 2024, The BigML Team' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -101,7 +103,8 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = 'default' +# a list of builtin themes. +html_theme = 'sphinx_rtd_theme' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the diff --git a/docs/index.rst b/docs/index.rst index 72f76660..b2f20837 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,16 +1,35 @@ +BigML Python Bindings +===================== + +`BigML `_ makes machine learning easy by taking care +of the details required to add data-driven decisions and predictive +power to your applications. Unlike other machine learning services, BigML +creates +`beautiful predictive models `_ that +can be easily understood and interacted with. + +These BigML Python bindings allow you interacting with BigML.io, the API +for BigML. You can use it to easily create, retrieve, list, update, and +delete BigML resources (i.e., sources, datasets, models and, +predictions). + +This module is licensed under the `Apache License, Version +2.0 `_. + .. 
toctree:: + :maxdepth: 2 :hidden: + :caption: Basic Usage - ml_resources - creating_resources + quick_start 101_model 101_ensemble 101_deepnet 101_linear_regression 101_logistic_regression + 101_optiml 101_fusion 101_ts - 101_optiml 101_cluster 101_anomaly 101_topic_model @@ -20,40 +39,27 @@ 101_images_classification 101_images_feature_extraction 101_object_detection - reading_resources - updating_resources - deleting_resources - local_resources - whizzml_resources - -BigML Python Bindings -===================== - -`BigML `_ makes machine learning easy by taking care -of the details required to add data-driven decisions and predictive -power to your company. Unlike other machine learning services, BigML -creates -`beautiful predictive models `_ that -can be easily understood and interacted with. -These BigML Python bindings allow you to interact with BigML.io, the API -for BigML. You can use it to easily create, retrieve, list, update, and -delete BigML resources (i.e., sources, datasets, models and, -predictions). +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: Resource Management -This module is licensed under the `Apache License, Version -2.0 `_. + ml_resources + creating_resources + reading_resources + updating_resources + deleting_resources -Support -------- -Please report problems and bugs to our `BigML.io issue -tracker `_. +.. toctree:: + :maxdepth: 2 + :hidden: + :caption: Client and Server Automation -Discussions about the different bindings take place in the general -`BigML mailing list `_. Or join us -in our `Campfire chatroom `_. + local_resources + whizzml_resources Requirements ------------ @@ -78,7 +84,7 @@ installed, but that is optional: we fall back to Python's built-in JSON libraries is ``simplejson`` is not found. `Node.js `_ is not installed by default, but will be -needed for `Local Pipelines `_ to work when datasets containing new added features are part of the transformation workflow. @@ -320,291 +326,6 @@ created in this environment have been moved to a special project in the now unique ``Production Environment``, so this flag is no longer needed to work with them. -Quick Start ------------ - -Imagine that you want to use `this csv -file `_ containing the `Iris -flower dataset `_ to -predict the species of a flower whose ``petal length`` is ``2.45`` and -whose ``petal width`` is ``1.75``. A preview of the dataset is shown -below. It has 4 numeric fields: ``sepal length``, ``sepal width``, -``petal length``, ``petal width`` and a categorical field: ``species``. -By default, BigML considers the last field in the dataset as the -objective field (i.e., the field that you want to generate predictions -for). - -:: - - sepal length,sepal width,petal length,petal width,species - 5.1,3.5,1.4,0.2,Iris-setosa - 4.9,3.0,1.4,0.2,Iris-setosa - 4.7,3.2,1.3,0.2,Iris-setosa - ... - 5.8,2.7,3.9,1.2,Iris-versicolor - 6.0,2.7,5.1,1.6,Iris-versicolor - 5.4,3.0,4.5,1.5,Iris-versicolor - ... - 6.8,3.0,5.5,2.1,Iris-virginica - 5.7,2.5,5.0,2.0,Iris-virginica - 5.8,2.8,5.1,2.4,Iris-virginica - -You can easily generate a prediction following these steps: - -.. code-block:: python - - from bigml.api import BigML - - api = BigML() - - source = api.create_source('./data/iris.csv') - dataset = api.create_dataset(source) - model = api.create_model(dataset) - prediction = api.create_prediction(model, \ - {"petal width": 1.75, "petal length": 2.45}) - -You can then print the prediction using the ``pprint`` method: - -.. 
code-block:: python - - >>> api.pprint(prediction) - species for {"petal width": 1.75, "petal length": 2.45} is Iris-setosa - -Certainly, any of the resources created in BigML can be configured using -several arguments described in the `API documentation `_. -Any of these configuration arguments can be added to the ``create`` method -as a dictionary in the last optional argument of the calls: - -.. code-block:: python - - from bigml.api import BigML - - api = BigML() - - source_args = {"name": "my source", - "source_parser": {"missing_tokens": ["NULL"]}} - source = api.create_source('./data/iris.csv', source_args) - dataset_args = {"name": "my dataset"} - dataset = api.create_dataset(source, dataset_args) - model_args = {"objective_field": "species"} - model = api.create_model(dataset, model_args) - prediction_args = {"name": "my prediction"} - prediction = api.create_prediction(model, \ - {"petal width": 1.75, "petal length": 2.45}, - prediction_args) - -The ``iris`` dataset has a small number of instances, and usually will be -instantly created, so the ``api.create_`` calls will probably return the -finished resources outright. As BigML's API is asynchronous, -in general you will need to ensure -that objects are finished before using them by using ``api.ok``. - -.. code-block:: python - - from bigml.api import BigML - - api = BigML() - - source = api.create_source('./data/iris.csv') - api.ok(source) - dataset = api.create_dataset(source) - api.ok(dataset) - model = api.create_model(dataset) - api.ok(model) - prediction = api.create_prediction(model, \ - {"petal width": 1.75, "petal length": 2.45}) - -Note that the prediction -call is not followed by the ``api.ok`` method. Predictions are so quick to be -generated that, unlike the -rest of resouces, will be generated synchronously as a finished object. - -Alternatively to the ``api.ok`` method, BigML offers -`webhooks `_ that can be set -when creating a resource and will call the url of you choice when the -finished or failed event is reached. A secret can be included in the call to -verify the webhook call authenticity, and a - -.. code-block:: python - - bigml.webhooks.check_signature(request, signature) - -function is offered to that end. As an example, this snippet creates a source -and sets a webhook to call ``https://my_webhook.com/endpoint`` when finished: - -.. code-block:: python - - from bigml.api import BigML - api = BigML() - # using a webhook with a secret - api.create_source("https://static.bigml.com/csv/iris.csv", - {"webhook": {"url": "https://my_webhook.com/endpoint", - "secret": "mysecret"}}) - - -The ``iris`` prediction example assumed that your objective -field (the one you want to predict) is the last field in the dataset. -If that's not he case, you can explicitly -set the name of this field in the creation call using the ``objective_field`` -argument: - - -.. code-block:: python - - from bigml.api import BigML - - api = BigML() - - source = api.create_source('./data/iris.csv') - api.ok(source) - dataset = api.create_dataset(source) - api.ok(dataset) - model = api.create_model(dataset, {"objective_field": "species"}) - api.ok(model) - prediction = api.create_prediction(model, \ - {'sepal length': 5, 'sepal width': 2.5}) - - -You can also generate an evaluation for the model by using: - -.. 
code-block:: python - - test_source = api.create_source('./data/test_iris.csv') - api.ok(test_source) - test_dataset = api.create_dataset(test_source) - api.ok(test_dataset) - evaluation = api.create_evaluation(model, test_dataset) - api.ok(evaluation) - - -The API object also offers the ``create``, ``get``, ``update`` and ``delete`` -generic methods to manage all type of resources. The type of resource to be -created is passed as first argument to the ``create`` method; - -.. code-block:: python - - from bigml.api import BigML - - api = BigML() - - source = api.create('source', './data/iris.csv') - source = api.update(source, {"name": "my new source name"}) - -Note that these methods don't need the ``api.ok`` method to be called -to wait for the resource to be finished. -The method waits internally for it by default. -This can be avoided by using ``finished=False`` as one of the arguments. - - -.. code-block:: python - - from bigml.api import BigML - - api = BigML() - - source = api.create('source', './data/iris.csv') - dataset = api.create('dataset', source, finished=False) # unfinished - api.ok(dataset) # waiting explicitly for the dataset to finish - dataset = api.update(dataset, {"name": "my_new_dataset_name"}, - finised=False) - api.ok(dataset) - -As an example for the ``delete`` and ``get`` methods, we could -create a batch prediction, put the predictions in a -dataset object and delete the ``batch_prediction``. - -.. code-block:: python - - from bigml.api import BigML - - api = BigML() - - batch_prediction = api.create('batchprediction', - 'model/5f3c3d2b5299637102000882', - 'dataset/5f29a563529963736c0116e9', - args={"output_dataset": True}) - batch_prediction_dataset = api.get(batch_prediction["object"][ \ - "output_dataset_resource"]) - api.delete(batch_prediction) - -If you set the ``storage`` argument in the ``api`` instantiation: - -.. code-block:: python - - api = BigML(storage='./storage') - -all the generated, updated or retrieved resources will be automatically -saved to the chosen directory. Once they are stored locally, the -``retrieve_resource`` method will look for the resource information -first in the local storage before trying to download the information from -the API. - -.. code-block:: python - - dataset = api.retrieve_resource("dataset/5e8e5672c7736e3d830037b5", - query_string="limit=-1") - - -Alternatively, you can use the ``export`` method to explicitly -download the JSON information -that describes any of your resources in BigML to a particular file: - -.. code-block:: python - - api.export('model/5acea49a08b07e14b9001068', - filename="my_dir/my_model.json") - -This example downloads the JSON for the model and stores it in -the ``my_dir/my_model.json`` file. - -In the case of models that can be represented in a `PMML` syntax, the -export method can be used to produce the corresponding `PMML` file. - -.. code-block:: python - - api.export('model/5acea49a08b07e14b9001068', - filename="my_dir/my_model.pmml", - pmml=True) - -You can also retrieve the last resource with some previously given tag: - -.. code-block:: python - - api.export_last("foo", - resource_type="ensemble", - filename="my_dir/my_ensemble.json") - -which selects the last ensemble that has a ``foo`` tag. This mechanism can -be specially useful when retrieving retrained models that have been created -with a shared unique keyword as tag. 
- -For a descriptive overview of the steps that you will usually need to -follow to model -your data and obtain predictions, please see the `basic Workflow sketch -`_ -document. You can also check other simple examples in the following documents: - -- `model 101 <101_model.html>`_ -- `logistic regression 101 <101_logistic_regression.html>`_ -- `linear regression 101 <101_linear_regression.html>`_ -- `ensemble 101 <101_ensemble.html>`_ -- `cluster 101 <101_cluster>`_ -- `anomaly detector 101 <101_anomaly.html>`_ -- `association 101 <101_association.html>`_ -- `topic model 101 <101_topic_model.html>`_ -- `deepnet 101 <101_deepnet.html>`_ -- `time series 101 <101_ts.html>`_ -- `fusion 101 <101_fusion.html>`_ -- `optiml 101 <101_optiml.html>`_ -- `PCA 101 <101_pca.html>`_ -- `scripting 101 <101_scripting.html>`_ - -And for examples on Image Processing: - -- `Images Classification 101 <101_images_classification.html>`_ -- `Object Detection 101<101_object_detection.html>`_ -- `Images Feature Extraction 101 <101_images_feature_extraction.html>`_ - Fields Structure ---------------- @@ -974,6 +695,7 @@ Install the tools required to build the documentation: .. code-block:: bash $ pip install sphinx + $ pip install sphinx-rtd-theme To build the HTML version of the documentation: @@ -984,6 +706,17 @@ To build the HTML version of the documentation: Then launch ``docs/_build/html/index.html`` in your browser. + +Support +------- + +Please report problems and bugs to our `BigML.io issue +tracker `_. + +Discussions about the different bindings take place in the general +`BigML mailing list `_. + + Additional Information ---------------------- diff --git a/docs/ml_resources.rst b/docs/ml_resources.rst index 195fb193..45ba0020 100644 --- a/docs/ml_resources.rst +++ b/docs/ml_resources.rst @@ -4,9 +4,28 @@ ML Resources ============ +This section describes the resources available in the BigML API. When retrieved +with the corresponding bindings ``get_[resource_type]`` method, they will +some common attributes, like: + +- ``resource`` which contains their ID +- ``category`` which can be set to the list of categories as defined in the + API documentation. +- ``creator`` which refers to the creator username. + +To name some. + +Beside, every resource type will have different properties as required +by its nature, that can be checked in the +`API documentation +`_. Here's a list of the different +resource types and their associated structures and properties. + +Data Ingestion and Preparation +------------------------------ External Connectors -------------------- +~~~~~~~~~~~~~~~~~~~ The ``Externalconnector`` object is is an abstract resource that helps you create ``Sources`` from several external data sources @@ -19,10 +38,10 @@ a Machine Learning resource, but a helper to connect your data repos to BigML. "externalconnector/5e30b685e476845dd901df83") You can check the external connector properties at the `API documentation -`_. +`_. Source ------- +~~~~~~ The ``Source`` is the first resource that you build in BigML when uploading a file. 
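Every resource retrieved through the bindings exposes the shared attributes listed above. A minimal sketch using a source (the resource ID is illustrative):

.. code-block:: python

    from bigml.api import BigML

    api = BigML()
    # any get_[resource_type] call returns a dict with these common attributes
    source = api.get_source("source/5e30b685e476845dd901df83")
    print(source["resource"])            # the resource ID
    print(source["object"]["category"])  # the category code
    print(source["object"]["creator"])   # the creator username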
BigML infers the structure of the file, whether it has headers or not, @@ -33,58 +52,58 @@ the ``Source`` information: >>> source = api.get_source("source/5e30b685e476845dd901df83") >>> api.pprint(source["object"]) - { u'category': 0, - u'charset': u'UTF-8', - u'code': 200, - u'configuration': None, - u'configuration_status': False, - u'content_type': u'text/plain;UTF-8', - u'created': u'2020-01-28T22:32:37.290000', - u'creator': u'mmartin', - u'credits': 0, - u'description': u'', - u'disable_datetime': False, - u'field_types': { u'categorical': 0, - u'datetime': 0, - u'items': 0, - u'numeric': 4, - u'text': 1, - u'total': 5}, - u'fields': { u'000000': { u'column_number': 0, - u'name': u'sepal length', - u'optype': u'numeric', - u'order': 0}, - u'000001': { u'column_number': 1, - u'name': u'sepal width', - u'optype': u'numeric', - u'order': 1}, - u'000002': { u'column_number': 2, - u'name': u'petal length', - u'optype': u'numeric', - u'order': 2}, - u'000003': { u'column_number': 3, - u'name': u'petal width', - u'optype': u'numeric', - u'order': 3}, - u'000004': { u'column_number': 4, - u'name': u'species', - u'optype': u'text', - u'order': 4, - u'term_analysis': { u'enabled': True}}}, - u'fields_meta': { u'count': 5, - u'image': 0, - u'limit': 1000, - u'offset': 0, - u'query_total': 5, - u'total': 5}, + { 'category': 0, + 'charset': 'UTF-8', + 'code': 200, + 'configuration': None, + 'configuration_status': False, + 'content_type': 'text/plain;UTF-8', + 'created': '2020-01-28T22:32:37.290000', + 'creator': 'mmartin', + 'credits': 0, + 'description': '', + 'disable_datetime': False, + 'field_types': { 'categorical': 0, + 'datetime': 0, + 'items': 0, + 'numeric': 4, + 'text': 1, + 'total': 5}, + 'fields': { '000000': { 'column_number': 0, + 'name': 'sepal length', + 'optype': 'numeric', + 'order': 0}, + '000001': { 'column_number': 1, + 'name': 'sepal width', + 'optype': 'numeric', + 'order': 1}, + '000002': { 'column_number': 2, + 'name': 'petal length', + 'optype': 'numeric', + 'order': 2}, + '000003': { 'column_number': 3, + 'name': 'petal width', + 'optype': 'numeric', + 'order': 3}, + '000004': { 'column_number': 4, + 'name': 'species', + 'optype': 'text', + 'order': 4, + 'term_analysis': { 'enabled': True}}}, + 'fields_meta': { 'count': 5, + 'image': 0, + 'limit': 1000, + 'offset': 0, + 'query_total': 5, + 'total': 5}, ... } You can check the source properties at the `API documentation -`_. +`_. Dataset -------- +~~~~~~~ If you want to get some basic statistics for each field you can retrieve the ``fields`` from the dataset as follows to get a dictionary keyed by @@ -94,32 +113,32 @@ field id: >>> dataset = api.get_dataset(dataset) >>> api.pprint(api.get_fields(dataset)) - { u'000000': { u'column_number': 0, - u'datatype': u'double', - u'name': u'sepal length', - u'optype': u'numeric', - u'summary': { u'maximum': 7.9, - u'median': 5.77889, - u'minimum': 4.3, - u'missing_count': 0, - u'population': 150, - u'splits': [ 4.51526, + { '000000': { 'column_number': 0, + 'datatype': 'double', + 'name': 'sepal length', + 'optype': 'numeric', + 'summary': { 'maximum': 7.9, + 'median': 5.77889, + 'minimum': 4.3, + 'missing_count': 0, + 'population': 150, + 'splits': [ 4.51526, 4.67252, 4.81113, [... snip ... 
] - u'000004': { u'column_number': 4, - u'datatype': u'string', - u'name': u'species', - u'optype': u'categorical', - u'summary': { u'categories': [ [ u'Iris-versicolor', + '000004': { 'column_number': 4, + 'datatype': 'string', + 'name': 'species', + 'optype': 'categorical', + 'summary': { 'categories': [ [ 'Iris-versicolor', 50], - [u'Iris-setosa', 50], - [ u'Iris-virginica', + ['Iris-setosa', 50], + [ 'Iris-virginica', 50]], - u'missing_count': 0}}} + 'missing_count': 0}}} The field filtering options are also available using a query string expression, @@ -132,965 +151,422 @@ for instance: limits the number of fields that will be included in ``dataset`` to 20. You can check the dataset properties at the `API documentation -`_. +`_. -Model ------ - -One of the greatest things about BigML is that the models that it -generates for you are fully white-boxed. To get the explicit tree-like -predictive model for the example above: +Samples +~~~~~~~ -.. code-block:: python +To provide quick access to your row data you can create a ``sample``. Samples +are in-memory objects that can be queried for subsets of data by limiting +their size, the fields or the rows returned. The structure of a sample would +be: - >>> model = api.get_model(model) - >>> api.pprint(model['object']['model']['root']) - {u'children': [ - {u'children': [ - {u'children': [{u'count': 38, - u'distribution': [[u'Iris-virginica', 38]], - u'output': u'Iris-virginica', - u'predicate': {u'field': u'000002', - u'operator': u'>', - u'value': 5.05}}, - u'children': [ - [ ... ] +.. code-block:: python - {u'count': 50, - u'distribution': [[u'Iris-setosa', 50]], - u'output': u'Iris-setosa', - u'predicate': {u'field': u'000002', - u'operator': u'<=', - u'value': 2.45}}]}, - {u'count': 150, - u'distribution': [[u'Iris-virginica', 50], - [u'Iris-versicolor', 50], - [u'Iris-setosa', 50]], - u'output': u'Iris-virginica', - u'predicate': True}]}}} + >>> from bigml.api import BigML + >>> api = BigML() + >>> sample = api.create_sample('dataset/55b7a6749841fa2500000d41', + {"max_rows": 150}) + >>> api.ok(sample) + >>> api.pprint(sample['object']) + { + "category": 0, + "code": 201, + "columns": 0, + "configuration": null, + "configuration_status": false, + "created": "2021-03-02T14:32:59.603699", + "creator": "alfred", + "dataset": "dataset/603e20a91f386f43db000004", + "dataset_status": true, + "description": "", + "excluded_fields": [], + "fields_meta": { + "count": 0, + "limit": 1000, + "offset": 0, + "total": 0 + }, + "input_fields": [ + "000000", + "000001", + "000002", + "000003", + "000004" + ], + "locale": "en_US", + "max_columns": 0, + "max_rows": 150, + "name": "iris", + "name_options": "", + "private": true, + "project": null, + "resource": "sample/603e4c9b1f386fdea6000000", + "rows": 0, + "seed": "d1dc0a2819344a079af521507b7e7ea8", + "shared": false, + "size": 4608, + "status": { + "code": 1, + "message": "The sample creation request has been queued and will be processed soon", + "progress": 0 + }, + "subscription": true, + "tags": [], + "type": 0, + "updated": "2021-03-02T14:32:59.603751" + } -(Note that we have abbreviated the output in the snippet above for -readability: the full predictive model you'll get is going to contain -much more details). -Again, filtering options are also available using a query string expression, -for instance: +Samples are not permanent objects. Once they are created, they will be +available as long as GETs are requested within periods smaller than +a pre-established TTL (Time to Live). 
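For instance (a minimal sketch, assuming the ``sample`` created above is still in scope), re-issuing a GET is enough to keep it available:

.. code-block:: python

    # each GET refreshes the sample's TTL and keeps it alive
    sample = api.get_sample(sample["resource"])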
The expiration timer of a sample is +reset every time a new GET is received. -.. code-block:: python +If requested, a sample can also perform linear regression and compute +Pearson's and Spearman's correlations for either one numeric field +against all other numeric fields or between two specific numeric fields. - >>> model = api.get_model(model, "limit=5") +You can check the sample properties at the `API documentation +`_. -limits the number of fields that will be included in ``model`` to 5. +Correlations +~~~~~~~~~~~~ -You can check the model properties at the `API documentation -`_. +A ``correlation`` resource contains a series of computations that reflect the +degree of dependence between the field set as objective for your predictions +and the rest of fields in your dataset. The dependence degree is obtained by +comparing the distributions in every objective and non-objective field pair, +as independent fields should have probabilistic +independent distributions. Depending on the types of the fields to compare, +the metrics used to compute the correlation degree will be: -Evaluation ----------- +- for numeric to numeric pairs: + `Pearson's `_ + and `Spearman's correlation `_ + coefficients. +- for numeric to categorical pairs: + `One-way Analysis of Variance `_, with the + categorical field as the predictor variable. +- for categorical to categorical pairs: + `contingency table (or two-way table) `_, + `Chi-square test of independence `_ + , and `Cramer's V `_ + and `Tschuprow's T `_ coefficients. -The predictive performance of a model can be measured using many different -measures. In BigML these measures can be obtained by creating evaluations. To -create an evaluation you need the id of the model you are evaluating and the id -of the dataset that contains the data to be tested with. The result is shown -as: +An example of the correlation resource JSON structure is: .. 
code-block:: python - >>> evaluation = api.get_evaluation(evaluation) - >>> api.pprint(evaluation['object']['result']) - { 'class_names': ['0', '1'], - 'mode': { 'accuracy': 0.9802, - 'average_f_measure': 0.495, - 'average_phi': 0, - 'average_precision': 0.5, - 'average_recall': 0.4901, - 'confusion_matrix': [[99, 0], [2, 0]], - 'per_class_statistics': [ { 'accuracy': 0.9801980198019802, - 'class_name': '0', - 'f_measure': 0.99, - 'phi_coefficient': 0, - 'precision': 1.0, - 'present_in_test_data': True, - 'recall': 0.9801980198019802}, - { 'accuracy': 0.9801980198019802, - 'class_name': '1', - 'f_measure': 0, - 'phi_coefficient': 0, - 'precision': 0.0, - 'present_in_test_data': True, - 'recall': 0}]}, - 'model': { 'accuracy': 0.9901, - 'average_f_measure': 0.89746, - 'average_phi': 0.81236, - 'average_precision': 0.99495, - 'average_recall': 0.83333, - 'confusion_matrix': [[98, 1], [0, 2]], - 'per_class_statistics': [ { 'accuracy': 0.9900990099009901, - 'class_name': '0', - 'f_measure': 0.9949238578680203, - 'phi_coefficient': 0.8123623944599232, - 'precision': 0.98989898989899, - 'present_in_test_data': True, - 'recall': 1.0}, - { 'accuracy': 0.9900990099009901, - 'class_name': '1', - 'f_measure': 0.8, - 'phi_coefficient': 0.8123623944599232, - 'precision': 1.0, - 'present_in_test_data': True, - 'recall': 0.6666666666666666}]}, - 'random': { 'accuracy': 0.50495, - 'average_f_measure': 0.36812, - 'average_phi': 0.13797, - 'average_precision': 0.74747, - 'average_recall': 0.51923, - 'confusion_matrix': [[49, 50], [0, 2]], - 'per_class_statistics': [ { 'accuracy': 0.504950495049505, - 'class_name': '0', - 'f_measure': 0.6621621621621622, - 'phi_coefficient': 0.1379728923974526, - 'precision': 0.494949494949495, - 'present_in_test_data': True, - 'recall': 1.0}, - { 'accuracy': 0.504950495049505, - 'class_name': '1', - 'f_measure': 0.07407407407407407, - 'phi_coefficient': 0.1379728923974526, - 'precision': 1.0, - 'present_in_test_data': True, - 'recall': 0.038461538461538464}]}} + >>> from bigml.api import BigML + >>> api = BigML() + >>> correlation = api.create_correlation('dataset/55b7a6749841fa2500000d41') + >>> api.ok(correlation) + >>> api.pprint(correlation['object']) + { 'category': 0, + 'clones': 0, + 'code': 200, + 'columns': 5, + 'correlations': { 'correlations': [ { 'name': 'one_way_anova', + 'result': { '000000': { 'eta_square': 0.61871, + 'f_ratio': 119.2645, + 'p_value': 0, + 'significant': [ True, + True, + True]}, + '000001': { 'eta_square': 0.40078, + 'f_ratio': 49.16004, + 'p_value': 0, + 'significant': [ True, + True, + True]}, + '000002': { 'eta_square': 0.94137, + 'f_ratio': 1180.16118, + 'p_value': 0, + 'significant': [ True, + True, + True]}, + '000003': { 'eta_square': 0.92888, + 'f_ratio': 960.00715, + 'p_value': 0, + 'significant': [ True, + True, + True]}}}], + 'fields': { '000000': { 'column_number': 0, + 'datatype': 'double', + 'idx': 0, + 'name': 'sepal length', + 'optype': 'numeric', + 'order': 0, + 'preferred': True, + 'summary': { 'bins': [ [ 4.3, + 1], + [ 4.425, + 4], + ... + [ 7.9, + 1]], + 'kurtosis': -0.57357, + 'maximum': 7.9, + 'mean': 5.84333, + 'median': 5.8, + 'minimum': 4.3, + 'missing_count': 0, + 'population': 150, + 'skewness': 0.31175, + 'splits': [ 4.51526, + 4.67252, + 4.81113, + 4.89582, + 4.96139, + 5.01131, + ... 
+ 6.92597, + 7.20423, + 7.64746], + 'standard_deviation': 0.82807, + 'sum': 876.5, + 'sum_squares': 5223.85, + 'variance': 0.68569}}, + '000001': { 'column_number': 1, + 'datatype': 'double', + 'idx': 1, + 'name': 'sepal width', + 'optype': 'numeric', + 'order': 1, + 'preferred': True, + 'summary': { 'counts': [ [ 2, + 1], + [ 2.2, + ... + '000004': { 'column_number': 4, + 'datatype': 'string', + 'idx': 4, + 'name': 'species', + 'optype': 'categorical', + 'order': 4, + 'preferred': True, + 'summary': { 'categories': [ [ 'Iris-setosa', + 50], + [ 'Iris-versicolor', + 50], + [ 'Iris-virginica', + 50]], + 'missing_count': 0}, + 'term_analysis': { 'enabled': True}}}, + 'significance_levels': [0.01, 0.05, 0.1]}, + 'created': '2015-07-28T18:07:37.010000', + 'credits': 0.017581939697265625, + 'dataset': 'dataset/55b7a6749841fa2500000d41', + 'dataset_status': True, + 'dataset_type': 0, + 'description': '', + 'excluded_fields': [], + 'fields_meta': { 'count': 5, + 'limit': 1000, + 'offset': 0, + 'query_total': 5, + 'total': 5}, + 'input_fields': ['000000', '000001', '000002', '000003'], + 'locale': 'en_US', + 'max_columns': 5, + 'max_rows': 150, + 'name': u"iris' dataset correlation", + 'objective_field_details': { 'column_number': 4, + 'datatype': 'string', + 'name': 'species', + 'optype': 'categorical', + 'order': 4}, + 'out_of_bag': False, + 'price': 0.0, + 'private': True, + 'project': None, + 'range': [1, 150], + 'replacement': False, + 'resource': 'correlation/55b7c4e99841fa24f20009bf', + 'rows': 150, + 'sample_rate': 1.0, + 'shared': False, + 'size': 4609, + 'source': 'source/55b7a6729841fa24f100036a', + 'source_status': True, + 'status': { 'code': 5, + 'elapsed': 274, + 'message': 'The correlation has been created', + 'progress': 1.0}, + 'subscription': True, + 'tags': [], + 'updated': '2015-07-28T18:07:49.057000', + 'white_box': False} -where two levels of detail are easily identified. For classifications, -the first level shows these keys: +Note that the output in the snippet above has been abbreviated. As you see, the +``correlations`` attribute contains the information about each field +correlation to the objective field. -- **class_names**: A list with the names of all the categories for the objective field (i.e., all the classes) -- **mode**: A detailed result object. Measures of the performance of the classifier that predicts the mode class for all the instances in the dataset -- **model**: A detailed result object. -- **random**: A detailed result object. Measures the performance of the classifier that predicts a random class for all the instances in the dataset. +You can check the correlations properties at the `API documentation +`_. -and the detailed result objects include ``accuracy``, ``average_f_measure``, ``average_phi``, -``average_precision``, ``average_recall``, ``confusion_matrix`` -and ``per_class_statistics``. -For regressions first level will contain these keys: +Statistical Tests +~~~~~~~~~~~~~~~~~ -- **mean**: A detailed result object. Measures the performance of the model that predicts the mean for all the instances in the dataset. -- **model**: A detailed result object. -- **random**: A detailed result object. Measures the performance of the model that predicts a random class for all the instances in the dataset. +A ``statisticaltest`` resource contains a series of tests +that compare the +distribution of data in each numeric field of a dataset +to certain canonical distributions, +such as the +`normal distribution `_ +or `Benford's law `_ +distribution. 
Statistical test are useful in tasks such as fraud, normality, +or outlier detection. -where the detailed result objects include ``mean_absolute_error``, -``mean_squared_error`` and ``r_squared`` (refer to -`developers documentation `_ for -more info on the meaning of these measures. +- Fraud Detection Tests: +Benford: This statistical test performs a comparison of the distribution of +first significant digits (FSDs) of each value of the field to the Benford's +law distribution. Benford's law applies to numerical distributions spanning +several orders of magnitude, such as the values found on financial balance +sheets. It states that the frequency distribution of leading, or first +significant digits (FSD) in such distributions is not uniform. +On the contrary, lower digits like 1 and 2 occur disproportionately +often as leading significant digits. The test compares the distribution +in the field to Bendford's distribution using a Chi-square goodness-of-fit +test, and Cho-Gaines d test. If a field has a dissimilar distribution, +it may contain anomalous or fraudulent values. -You can check the evaluation properties at the `API documentation -`_. +- Normality tests: +These tests can be used to confirm the assumption that the data in each field +of a dataset is distributed according to a normal distribution. The results +are relevant because many statistical and machine learning techniques rely on +this assumption. +Anderson-Darling: The Anderson-Darling test computes a test statistic based on +the difference between the observed cumulative distribution function (CDF) to +that of a normal distribution. A significant result indicates that the +assumption of normality is rejected. +Jarque-Bera: The Jarque-Bera test computes a test statistic based on the third +and fourth central moments (skewness and kurtosis) of the data. Again, a +significant result indicates that the normality assumption is rejected. +Z-score: For a given sample size, the maximum deviation from the mean that +would expected in a sampling of a normal distribution can be computed based +on the 68-95-99.7 rule. This test simply reports this expected deviation and +the actual deviation observed in the data, as a sort of sanity check. -Cluster -------- +- Outlier tests: +Grubbs: When the values of a field are normally distributed, a few values may +still deviate from the mean distribution. The outlier tests reports whether +at least one value in each numeric field differs significantly from the mean +using Grubb's test for outliers. If an outlier is found, then its value will +be returned. -For unsupervised learning problems, the cluster is used to classify in a -limited number of groups your training data. The cluster structure is defined -by the centers of each group of data, named centroids, and the data enclosed -in the group. As for in the model's case, the cluster is a white-box resource -and can be retrieved as a JSON: +The JSON structure for ``statisticaltest`` resources is similar to this one: .. 
code-block:: python - >>> cluster = api.get_cluster(cluster) - >>> api.pprint(cluster['object']) - { 'balance_fields': True, - 'category': 0, - 'cluster_datasets': { '000000': '', '000001': '', '000002': ''}, - 'cluster_datasets_ids': { '000000': '53739b9ae4b0dad82b0a65e6', - '000001': '53739b9ae4b0dad82b0a65e7', - '000002': '53739b9ae4b0dad82b0a65e8'}, - 'cluster_seed': '2c249dda00fbf54ab4cdd850532a584f286af5b6', - 'clusters': { 'clusters': [ { 'center': { '000000': 58.5, - '000001': 26.8314, - '000002': 44.27907, - '000003': 14.37209}, - 'count': 56, - 'distance': { 'bins': [ [ 0.69602, - 2], - [ ... ] - [ 3.77052, - 1]], - 'maximum': 3.77052, - 'mean': 1.61711, - 'median': 1.52146, - 'minimum': 0.69237, - 'population': 56, - 'standard_deviation': 0.6161, - 'sum': 90.55805, - 'sum_squares': 167.31926, - 'variance': 0.37958}, - 'id': '000000', - 'name': 'Cluster 0'}, - { 'center': { '000000': 50.06, - '000001': 34.28, - '000002': 14.62, - '000003': 2.46}, - 'count': 50, - 'distance': { 'bins': [ [ 0.16917, - 1], - [ ... ] - [ 4.94699, - 1]], - 'maximum': 4.94699, - 'mean': 1.50725, - 'median': 1.3393, - 'minimum': 0.16917, - 'population': 50, - 'standard_deviation': 1.00994, - 'sum': 75.36252, - 'sum_squares': 163.56918, - 'variance': 1.01998}, - 'id': '000001', - 'name': 'Cluster 1'}, - { 'center': { '000000': 68.15625, - '000001': 31.25781, - '000002': 55.48438, - '000003': 19.96875}, - 'count': 44, - 'distance': { 'bins': [ [ 0.36825, - 1], - [ ... ] - [ 3.87216, - 1]], - 'maximum': 3.87216, - 'mean': 1.67264, - 'median': 1.63705, - 'minimum': 0.36825, - 'population': 44, - 'standard_deviation': 0.78905, - 'sum': 73.59627, - 'sum_squares': 149.87194, - 'variance': 0.6226}, - 'id': '000002', - 'name': 'Cluster 2'}], - 'fields': { '000000': { 'column_number': 0, - 'datatype': 'int8', - 'name': 'sepal length', - 'optype': 'numeric', - 'order': 0, - 'preferred': True, - 'summary': { 'bins': [ [ 43.75, - 4], - [ ... ] - [ 79, - 1]], - 'maximum': 79, - 'mean': 58.43333, - 'median': 57.7889, - 'minimum': 43, - 'missing_count': 0, - 'population': 150, - 'splits': [ 45.15258, - 46.72525, - 72.04226, - 76.47461], - 'standard_deviation': 8.28066, - 'sum': 8765, - 'sum_squares': 522385, - 'variance': 68.56935}}, - [ ... 
] - [ 25, - 3]], - 'maximum': 25, - 'mean': 11.99333, - 'median': 13.28483, - 'minimum': 1, - 'missing_count': 0, - 'population': 150, - 'standard_deviation': 7.62238, - 'sum': 1799, - 'sum_squares': 30233, - 'variance': 58.10063}}}}, - 'code': 202, - 'columns': 4, - 'created': '2014-05-14T16:36:40.993000', - 'credits': 0.017578125, - 'credits_per_prediction': 0.0, - 'dataset': 'dataset/53739b88c8db63122b000411', - 'dataset_field_types': { 'categorical': 1, - 'datetime': 0, - 'numeric': 4, - 'preferred': 5, - 'text': 0, - 'total': 5}, + >>> statistical_test = api.create_statistical_test('dataset/55b7a6749841fa2500000d41') + >>> api.ok(statistical_test) + True + >>> api.pprint(statistical_test['object']) + { 'category': 0, + 'clones': 0, + 'code': 200, + 'columns': 5, + 'created': '2015-07-28T18:16:40.582000', + 'credits': 0.017581939697265625, + 'dataset': 'dataset/55b7a6749841fa2500000d41', 'dataset_status': True, 'dataset_type': 0, 'description': '', - 'excluded_fields': ['000004'], - 'field_scales': None, - 'fields_meta': { 'count': 4, - 'limit': 1000, - 'offset': 0, - 'query_total': 4, - 'total': 4}, + 'excluded_fields': [], + 'fields_meta': { 'count': 5, + 'limit': 1000, + 'offset': 0, + 'query_total': 5, + 'total': 5}, 'input_fields': ['000000', '000001', '000002', '000003'], - 'k': 3, - 'locale': 'es-ES', + 'locale': 'en_US', 'max_columns': 5, 'max_rows': 150, - 'name': 'my iris', - 'number_of_batchcentroids': 0, - 'number_of_centroids': 0, - 'number_of_public_centroids': 0, + 'name': u"iris' dataset test", 'out_of_bag': False, 'price': 0.0, 'private': True, + 'project': None, 'range': [1, 150], 'replacement': False, - 'resource': 'cluster/53739b98d994972da7001de9', + 'resource': 'statisticaltest/55b7c7089841fa25000010ad', 'rows': 150, 'sample_rate': 1.0, - 'scales': { '000000': 0.22445382597655375, - '000001': 0.4264213814821549, - '000002': 0.10528680248949522, - '000003': 0.2438379900517961}, 'shared': False, - 'size': 4608, - 'source': 'source/53739b24d994972da7001ddd', + 'size': 4609, + 'source': 'source/55b7a6729841fa24f100036a', 'source_status': True, 'status': { 'code': 5, - 'elapsed': 1009, - 'message': 'The cluster has been created', - 'progress': 1.0}, + 'elapsed': 302, + 'message': 'The test has been created', + 'progress': 1.0}, 'subscription': True, 'tags': [], - 'updated': '2014-05-14T16:40:26.234728', - 'white_box': False} - -(Note that we have abbreviated the output in the snippet above for -readability: the full predictive cluster you'll get is going to contain -much more details). - -You can check the cluster properties at the `API documentation -`_. - -Anomaly detector ----------------- - -For anomaly detection problems, BigML anomaly detector uses iforest as an -unsupervised kind of model that detects anomalous data in a dataset. The -information it returns encloses a `top_anomalies` block -that contains a list of the most anomalous -points. For each, we capture a `score` from 0 to 1. The closer to 1, -the more anomalous. We also capture the `row` which gives values for -each field in the order defined by `input_fields`. Similarly we give -a list of `importances` which match the `row` values. These -importances tell us which values contributed most to the anomaly -score. Thus, the structure of an anomaly detector is similar to: - -.. 
code-block:: python - - { 'category': 0, - 'code': 200, - 'columns': 14, - 'constraints': False, - 'created': '2014-09-08T18:51:11.893000', - 'credits': 0.11653518676757812, - 'credits_per_prediction': 0.0, - 'dataset': 'dataset/540dfa9d9841fa5c88000765', - 'dataset_field_types': { 'categorical': 21, - 'datetime': 0, - 'numeric': 21, - 'preferred': 14, - 'text': 0, - 'total': 42}, - 'dataset_status': True, - 'dataset_type': 0, - 'description': '', - 'excluded_fields': [], - 'fields_meta': { 'count': 14, - 'limit': 1000, - 'offset': 0, - 'query_total': 14, - 'total': 14}, - 'forest_size': 128, - 'input_fields': [ '000004', - '000005', - '000009', - '000016', - '000017', - '000018', - '000019', - '00001e', - '00001f', - '000020', - '000023', - '000024', - '000025', - '000026'], - 'locale': 'en_US', - 'max_columns': 42, - 'max_rows': 200, - 'model': { 'fields': { '000004': { 'column_number': 4, - 'datatype': 'int16', - 'name': 'src_bytes', - 'optype': 'numeric', - 'order': 0, - 'preferred': True, - 'summary': { 'bins': [ [ 143, - 2], - ... - [ 370, - 2]], - 'maximum': 370, - 'mean': 248.235, - 'median': 234.57157, - 'minimum': 141, - 'missing_count': 0, - 'population': 200, - 'splits': [ 159.92462, - 173.73312, - 188, - ... - 339.55228], - 'standard_deviation': 49.39869, - 'sum': 49647, - 'sum_squares': 12809729, - 'variance': 2440.23093}}, - '000005': { 'column_number': 5, - 'datatype': 'int32', - 'name': 'dst_bytes', - 'optype': 'numeric', - 'order': 1, - 'preferred': True, - ... - 'sum': 1030851, - 'sum_squares': 22764504759, - 'variance': 87694652.45224}}, - '000009': { 'column_number': 9, - 'datatype': 'string', - 'name': 'hot', - 'optype': 'categorical', - 'order': 2, - 'preferred': True, - 'summary': { 'categories': [ [ '0', - 199], - [ '1', - 1]], - 'missing_count': 0}, - 'term_analysis': { 'enabled': True}}, - '000016': { 'column_number': 22, - 'datatype': 'int8', - 'name': 'count', - 'optype': 'numeric', - 'order': 3, - 'preferred': True, - ... - 'population': 200, - 'standard_deviation': 5.42421, - 'sum': 1351, - 'sum_squares': 14981, - 'variance': 29.42209}}, - '000017': { ... }}}, - 'kind': 'iforest', - 'mean_depth': 12.314174107142858, - 'top_anomalies': [ { 'importance': [ 0.06768, - 0.01667, - 0.00081, - 0.02437, - 0.04773, - 0.22197, - 0.18208, - 0.01868, - 0.11855, - 0.01983, - 0.01898, - 0.05306, - 0.20398, - 0.00562], - 'row': [ 183.0, - 8654.0, - '0', - 4.0, - 4.0, - 0.25, - 0.25, - 0.0, - 123.0, - 255.0, - 0.01, - 0.04, - 0.01, - 0.0], - 'score': 0.68782}, - { 'importance': [ 0.05645, - 0.02285, - 0.0015, - 0.05196, - 0.04435, - 0.0005, - 0.00056, - 0.18979, - 0.12402, - 0.23671, - 0.20723, - 0.05651, - 0.00144, - 0.00612], - 'row': [ 212.0, - 1940.0, - '0', - 1.0, - 2.0, - 0.0, - 0.0, - 1.0, - 1.0, - 69.0, - 1.0, - 0.04, - 0.0, - 0.0], - 'score': 0.6239}, - ...], - 'trees': [ { 'root': { 'children': [ { 'children': [ { 'children': [ { 'children': [ { 'children': - [ { 'population': 1, - 'predicates': [ { 'field': '00001f', - 'op': '>', - 'value': 35.54357}]}, - - ... 
- { 'population': 1, - 'predicates': [ { 'field': '00001f', - 'op': '<=', - 'value': 35.54357}]}], - 'population': 2, - 'predicates': [ { 'field': '000005', - 'op': '<=', - 'value': 1385.5166}]}], - 'population': 3, - 'predicates': [ { 'field': '000020', - 'op': '<=', - 'value': 65.14308}, - { 'field': '000019', - 'op': '=', - 'value': 0}]}], - 'population': 105, - 'predicates': [ { 'field': '000017', - 'op': '<=', - 'value': 13.21754}, - { 'field': '000009', - 'op': 'in', - 'value': [ '0']}]}], - 'population': 126, - 'predicates': [ True, - { 'field': '000018', - 'op': '=', - 'value': 0}]}, - 'training_mean_depth': 11.071428571428571}]}, - 'name': "tiny_kdd's dataset anomaly detector", - 'number_of_batchscores': 0, - 'number_of_public_predictions': 0, - 'number_of_scores': 0, - 'out_of_bag': False, - 'price': 0.0, - 'private': True, - 'project': None, - 'range': [1, 200], - 'replacement': False, - 'resource': 'anomaly/540dfa9f9841fa5c8800076a', - 'rows': 200, - 'sample_rate': 1.0, - 'sample_size': 126, - 'seed': 'BigML', - 'shared': False, - 'size': 30549, - 'source': 'source/540dfa979841fa5c7f000363', - 'source_status': True, - 'status': { 'code': 5, - 'elapsed': 32397, - 'message': 'The anomaly detector has been created', - 'progress': 1.0}, - 'subscription': False, - 'tags': [], - 'updated': '2014-09-08T23:54:28.647000', - 'white_box': False} - -Note that we have abbreviated the output in the snippet above for -readability: the full anomaly detector you'll get is going to contain -much more details). - -The `trees` list contains the actual isolation forest, and it can be quite -large usually. That's why, this part of the resource should only be included -in downloads when needed. If you are only interested in other properties, such -as `top_anomalies`, you'll improve performance by excluding it, using the -`excluded=trees` query string in the API call: - -.. code-block:: python - - anomaly = api.get_anomaly('anomaly/540dfa9f9841fa5c8800076a', \ - query_string='excluded=trees') - -Each node in an isolation tree can have multiple predicates. -For the node to be a valid branch when evaluated with a data point, all of its -predicates must be true. - -You can check the anomaly detector properties at the `API documentation -`_. - -Samples -------- - -To provide quick access to your row data you can create a ``sample``. Samples -are in-memory objects that can be queried for subsets of data by limiting -their size, the fields or the rows returned. The structure of a sample would -be:: - -Samples are not permanent objects. Once they are created, they will be -available as long as GETs are requested within periods smaller than -a pre-established TTL (Time to Live). The expiration timer of a sample is -reset every time a new GET is received. - -If requested, a sample can also perform linear regression and compute -Pearson's and Spearman's correlations for either one numeric field -against all other numeric fields or between two specific numeric fields. - -You can check the sample properties at the `API documentation -`_. - -Correlations ------------- - -A ``correlation`` resource contains a series of computations that reflect the -degree of dependence between the field set as objective for your predictions -and the rest of fields in your dataset. The dependence degree is obtained by -comparing the distributions in every objective and non-objective field pair, -as independent fields should have probabilistic -independent distributions. 
Depending on the types of the fields to compare, -the metrics used to compute the correlation degree will be: - -- for numeric to numeric pairs: - `Pearson's `_ - and `Spearman's correlation `_ - coefficients. -- for numeric to categorical pairs: - `One-way Analysis of Variance `_, with the - categorical field as the predictor variable. -- for categorical to categorical pairs: - `contingency table (or two-way table) `_, - `Chi-square test of independence `_ - , and `Cramer's V `_ - and `Tschuprow's T `_ coefficients. - -An example of the correlation resource JSON structure is: - -.. code-block:: python - - >>> from bigml.api import BigML - >>> api = BigML() - >>> correlation = api.create_correlation('dataset/55b7a6749841fa2500000d41') - >>> api.ok(correlation) - >>> api.pprint(correlation['object']) - { u'category': 0, - u'clones': 0, - u'code': 200, - u'columns': 5, - u'correlations': { u'correlations': [ { u'name': u'one_way_anova', - u'result': { u'000000': { u'eta_square': 0.61871, - u'f_ratio': 119.2645, - u'p_value': 0, - u'significant': [ True, - True, - True]}, - u'000001': { u'eta_square': 0.40078, - u'f_ratio': 49.16004, - u'p_value': 0, - u'significant': [ True, - True, - True]}, - u'000002': { u'eta_square': 0.94137, - u'f_ratio': 1180.16118, - u'p_value': 0, - u'significant': [ True, - True, - True]}, - u'000003': { u'eta_square': 0.92888, - u'f_ratio': 960.00715, - u'p_value': 0, - u'significant': [ True, - True, - True]}}}], - u'fields': { u'000000': { u'column_number': 0, - u'datatype': u'double', - u'idx': 0, - u'name': u'sepal length', - u'optype': u'numeric', - u'order': 0, - u'preferred': True, - u'summary': { u'bins': [ [ 4.3, - 1], - [ 4.425, - 4], - ... - [ 7.9, - 1]], - u'kurtosis': -0.57357, - u'maximum': 7.9, - u'mean': 5.84333, - u'median': 5.8, - u'minimum': 4.3, - u'missing_count': 0, - u'population': 150, - u'skewness': 0.31175, - u'splits': [ 4.51526, - 4.67252, - 4.81113, - 4.89582, - 4.96139, - 5.01131, - ... - 6.92597, - 7.20423, - 7.64746], - u'standard_deviation': 0.82807, - u'sum': 876.5, - u'sum_squares': 5223.85, - u'variance': 0.68569}}, - u'000001': { u'column_number': 1, - u'datatype': u'double', - u'idx': 1, - u'name': u'sepal width', - u'optype': u'numeric', - u'order': 1, - u'preferred': True, - u'summary': { u'counts': [ [ 2, - 1], - [ 2.2, - ... 
- u'000004': { u'column_number': 4, - u'datatype': u'string', - u'idx': 4, - u'name': u'species', - u'optype': u'categorical', - u'order': 4, - u'preferred': True, - u'summary': { u'categories': [ [ u'Iris-setosa', - 50], - [ u'Iris-versicolor', - 50], - [ u'Iris-virginica', - 50]], - u'missing_count': 0}, - u'term_analysis': { u'enabled': True}}}, - u'significance_levels': [0.01, 0.05, 0.1]}, - u'created': u'2015-07-28T18:07:37.010000', - u'credits': 0.017581939697265625, - u'dataset': u'dataset/55b7a6749841fa2500000d41', - u'dataset_status': True, - u'dataset_type': 0, - u'description': u'', - u'excluded_fields': [], - u'fields_meta': { u'count': 5, - u'limit': 1000, - u'offset': 0, - u'query_total': 5, - u'total': 5}, - u'input_fields': [u'000000', u'000001', u'000002', u'000003'], - u'locale': u'en_US', - u'max_columns': 5, - u'max_rows': 150, - u'name': u"iris' dataset correlation", - u'objective_field_details': { u'column_number': 4, - u'datatype': u'string', - u'name': u'species', - u'optype': u'categorical', - u'order': 4}, - u'out_of_bag': False, - u'price': 0.0, - u'private': True, - u'project': None, - u'range': [1, 150], - u'replacement': False, - u'resource': u'correlation/55b7c4e99841fa24f20009bf', - u'rows': 150, - u'sample_rate': 1.0, - u'shared': False, - u'size': 4609, - u'source': u'source/55b7a6729841fa24f100036a', - u'source_status': True, - u'status': { u'code': 5, - u'elapsed': 274, - u'message': u'The correlation has been created', - u'progress': 1.0}, - u'subscription': True, - u'tags': [], - u'updated': u'2015-07-28T18:07:49.057000', - u'white_box': False} - -Note that the output in the snippet above has been abbreviated. As you see, the -``correlations`` attribute contains the information about each field -correlation to the objective field. - -You can check the correlations properties at the `API documentation -`_. - - -Statistical Tests ------------------ - -A ``statisticaltest`` resource contains a series of tests -that compare the -distribution of data in each numeric field of a dataset -to certain canonical distributions, -such as the -`normal distribution `_ -or `Benford's law `_ -distribution. Statistical test are useful in tasks such as fraud, normality, -or outlier detection. - -- Fraud Detection Tests: -Benford: This statistical test performs a comparison of the distribution of -first significant digits (FSDs) of each value of the field to the Benford's -law distribution. Benford's law applies to numerical distributions spanning -several orders of magnitude, such as the values found on financial balance -sheets. It states that the frequency distribution of leading, or first -significant digits (FSD) in such distributions is not uniform. -On the contrary, lower digits like 1 and 2 occur disproportionately -often as leading significant digits. The test compares the distribution -in the field to Bendford's distribution using a Chi-square goodness-of-fit -test, and Cho-Gaines d test. If a field has a dissimilar distribution, -it may contain anomalous or fraudulent values. - -- Normality tests: -These tests can be used to confirm the assumption that the data in each field -of a dataset is distributed according to a normal distribution. The results -are relevant because many statistical and machine learning techniques rely on -this assumption. -Anderson-Darling: The Anderson-Darling test computes a test statistic based on -the difference between the observed cumulative distribution function (CDF) to -that of a normal distribution. 
A significant result indicates that the -assumption of normality is rejected. -Jarque-Bera: The Jarque-Bera test computes a test statistic based on the third -and fourth central moments (skewness and kurtosis) of the data. Again, a -significant result indicates that the normality assumption is rejected. -Z-score: For a given sample size, the maximum deviation from the mean that -would expected in a sampling of a normal distribution can be computed based -on the 68-95-99.7 rule. This test simply reports this expected deviation and -the actual deviation observed in the data, as a sort of sanity check. - -- Outlier tests: -Grubbs: When the values of a field are normally distributed, a few values may -still deviate from the mean distribution. The outlier tests reports whether -at least one value in each numeric field differs significantly from the mean -using Grubb's test for outliers. If an outlier is found, then its value will -be returned. - -The JSON structure for ``statisticaltest`` resources is similar to this one: - -.. code-block:: python - - >>> statistical_test = api.create_statistical_test('dataset/55b7a6749841fa2500000d41') - >>> api.ok(statistical_test) - True - >>> api.pprint(statistical_test['object']) - { u'category': 0, - u'clones': 0, - u'code': 200, - u'columns': 5, - u'created': u'2015-07-28T18:16:40.582000', - u'credits': 0.017581939697265625, - u'dataset': u'dataset/55b7a6749841fa2500000d41', - u'dataset_status': True, - u'dataset_type': 0, - u'description': u'', - u'excluded_fields': [], - u'fields_meta': { u'count': 5, - u'limit': 1000, - u'offset': 0, - u'query_total': 5, - u'total': 5}, - u'input_fields': [u'000000', u'000001', u'000002', u'000003'], - u'locale': u'en_US', - u'max_columns': 5, - u'max_rows': 150, - u'name': u"iris' dataset test", - u'out_of_bag': False, - u'price': 0.0, - u'private': True, - u'project': None, - u'range': [1, 150], - u'replacement': False, - u'resource': u'statisticaltest/55b7c7089841fa25000010ad', - u'rows': 150, - u'sample_rate': 1.0, - u'shared': False, - u'size': 4609, - u'source': u'source/55b7a6729841fa24f100036a', - u'source_status': True, - u'status': { u'code': 5, - u'elapsed': 302, - u'message': u'The test has been created', - u'progress': 1.0}, - u'subscription': True, - u'tags': [], - u'statistical_tests': { u'ad_sample_size': 1024, - u'fields': { u'000000': { u'column_number': 0, - u'datatype': u'double', - u'idx': 0, - u'name': u'sepal length', - u'optype': u'numeric', - u'order': 0, - u'preferred': True, - u'summary': { u'bins': [ [ 4.3, - 1], - [ 4.425, - 4], + 'statistical_tests': { 'ad_sample_size': 1024, + 'fields': { '000000': { 'column_number': 0, + 'datatype': 'double', + 'idx': 0, + 'name': 'sepal length', + 'optype': 'numeric', + 'order': 0, + 'preferred': True, + 'summary': { 'bins': [ [ 4.3, + 1], + [ 4.425, + 4], ... [ 7.9, 1]], - u'kurtosis': -0.57357, - u'maximum': 7.9, - u'mean': 5.84333, - u'median': 5.8, - u'minimum': 4.3, - u'missing_count': 0, - u'population': 150, - u'skewness': 0.31175, - u'splits': [ 4.51526, + 'kurtosis': -0.57357, + 'maximum': 7.9, + 'mean': 5.84333, + 'median': 5.8, + 'minimum': 4.3, + 'missing_count': 0, + 'population': 150, + 'skewness': 0.31175, + 'splits': [ 4.51526, 4.67252, 4.81113, 4.89582, ... 7.20423, 7.64746], - u'standard_deviation': 0.82807, - u'sum': 876.5, - u'sum_squares': 5223.85, - u'variance': 0.68569}}, + 'standard_deviation': 0.82807, + 'sum': 876.5, + 'sum_squares': 5223.85, + 'variance': 0.68569}}, ... 
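    # Besides the per-field summaries, the 'statistical_tests' attribute
    # groups the results of the 'fraud', 'normality' and 'outliers' tests
    # shown further down, keyed by the ID of each numeric input field.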
- u'000004': { u'column_number': 4, - u'datatype': u'string', - u'idx': 4, - u'name': u'species', - u'optype': u'categorical', - u'order': 4, - u'preferred': True, - u'summary': { u'categories': [ [ u'Iris-setosa', + '000004': { 'column_number': 4, + 'datatype': 'string', + 'idx': 4, + 'name': 'species', + 'optype': 'categorical', + 'order': 4, + 'preferred': True, + 'summary': { 'categories': [ [ 'Iris-setosa', 50], - [ u'Iris-versicolor', + [ 'Iris-versicolor', 50], - [ u'Iris-virginica', + [ 'Iris-virginica', 50]], - u'missing_count': 0}, - u'term_analysis': { u'enabled': True}}}, - u'fraud': [ { u'name': u'benford', - u'result': { u'000000': { u'chi_square': { u'chi_square_value': 506.39302, - u'p_value': 0, - u'significant': [ True, + 'missing_count': 0}, + 'term_analysis': { 'enabled': True}}}, + 'fraud': [ { 'name': 'benford', + 'result': { '000000': { 'chi_square': { 'chi_square_value': 506.39302, + 'p_value': 0, + 'significant': [ True, True, True]}, - u'cho_gaines': { u'd_statistic': 7.124311073683573, - u'significant': [ True, + 'cho_gaines': { 'd_statistic': 7.124311073683573, + 'significant': [ True, True, True]}, - u'distribution': [ 0, + 'distribution': [ 0, 0, 0, 22, @@ -1099,18 +575,18 @@ The JSON structure for ``statisticaltest`` resources is similar to this one: 13, 0, 0], - u'negatives': 0, - u'zeros': 0}, - u'000001': { u'chi_square': { u'chi_square_value': 396.76556, - u'p_value': 0, - u'significant': [ True, + 'negatives': 0, + 'zeros': 0}, + '000001': { 'chi_square': { 'chi_square_value': 396.76556, + 'p_value': 0, + 'significant': [ True, True, True]}, - u'cho_gaines': { u'd_statistic': 7.503503138331123, - u'significant': [ True, + 'cho_gaines': { 'd_statistic': 7.503503138331123, + 'significant': [ True, True, True]}, - u'distribution': [ 0, + 'distribution': [ 0, 57, 89, 4, @@ -1119,18 +595,18 @@ The JSON structure for ``statisticaltest`` resources is similar to this one: 0, 0, 0], - u'negatives': 0, - u'zeros': 0}, - u'000002': { u'chi_square': { u'chi_square_value': 154.20728, - u'p_value': 0, - u'significant': [ True, + 'negatives': 0, + 'zeros': 0}, + '000002': { 'chi_square': { 'chi_square_value': 154.20728, + 'p_value': 0, + 'significant': [ True, True, True]}, - u'cho_gaines': { u'd_statistic': 3.9229974017266054, - u'significant': [ True, + 'cho_gaines': { 'd_statistic': 3.9229974017266054, + 'significant': [ True, True, True]}, - u'distribution': [ 50, + 'distribution': [ 50, 0, 11, 43, @@ -1139,18 +615,18 @@ The JSON structure for ``statisticaltest`` resources is similar to this one: 0, 0, 0], - u'negatives': 0, - u'zeros': 0}, - u'000003': { u'chi_square': { u'chi_square_value': 111.4438, - u'p_value': 0, - u'significant': [ True, + 'negatives': 0, + 'zeros': 0}, + '000003': { 'chi_square': { 'chi_square_value': 111.4438, + 'p_value': 0, + 'significant': [ True, True, True]}, - u'cho_gaines': { u'd_statistic': 4.103257341299901, - u'significant': [ True, + 'cho_gaines': { 'd_statistic': 4.103257341299901, + 'significant': [ True, True, True]}, - u'distribution': [ 76, + 'distribution': [ 76, 58, 7, 7, @@ -1159,71 +635,71 @@ The JSON structure for ``statisticaltest`` resources is similar to this one: 0, 0, 0], - u'negatives': 0, - u'zeros': 0}}}], - u'normality': [ { u'name': u'anderson_darling', - u'result': { u'000000': { u'p_value': 0.02252, - u'significant': [ False, + 'negatives': 0, + 'zeros': 0}}}], + 'normality': [ { 'name': 'anderson_darling', + 'result': { '000000': { 'p_value': 0.02252, + 'significant': [ False, True, True]}, - u'000001': { 
u'p_value': 0.02023, - u'significant': [ False, + '000001': { 'p_value': 0.02023, + 'significant': [ False, True, True]}, - u'000002': { u'p_value': 0, - u'significant': [ True, + '000002': { 'p_value': 0, + 'significant': [ True, True, True]}, - u'000003': { u'p_value': 0, - u'significant': [ True, + '000003': { 'p_value': 0, + 'significant': [ True, True, True]}}}, - { u'name': u'jarque_bera', - u'result': { u'000000': { u'p_value': 0.10615, - u'significant': [ False, + { 'name': 'jarque_bera', + 'result': { '000000': { 'p_value': 0.10615, + 'significant': [ False, False, False]}, - u'000001': { u'p_value': 0.25957, - u'significant': [ False, + '000001': { 'p_value': 0.25957, + 'significant': [ False, False, False]}, - u'000002': { u'p_value': 0.0009, - u'significant': [ True, + '000002': { 'p_value': 0.0009, + 'significant': [ True, True, True]}, - u'000003': { u'p_value': 0.00332, - u'significant': [ True, + '000003': { 'p_value': 0.00332, + 'significant': [ True, True, True]}}}, - { u'name': u'z_score', - u'result': { u'000000': { u'expected_max_z': 2.71305, - u'max_z': 2.48369}, - u'000001': { u'expected_max_z': 2.71305, - u'max_z': 3.08044}, - u'000002': { u'expected_max_z': 2.71305, - u'max_z': 1.77987}, - u'000003': { u'expected_max_z': 2.71305, - u'max_z': 1.70638}}}], - u'outliers': [ { u'name': u'grubbs', - u'result': { u'000000': { u'p_value': 1, - u'significant': [ False, + { 'name': 'z_score', + 'result': { '000000': { 'expected_max_z': 2.71305, + 'max_z': 2.48369}, + '000001': { 'expected_max_z': 2.71305, + 'max_z': 3.08044}, + '000002': { 'expected_max_z': 2.71305, + 'max_z': 1.77987}, + '000003': { 'expected_max_z': 2.71305, + 'max_z': 1.70638}}}], + 'outliers': [ { 'name': 'grubbs', + 'result': { '000000': { 'p_value': 1, + 'significant': [ False, False, False]}, - u'000001': { u'p_value': 0.26555, - u'significant': [ False, + '000001': { 'p_value': 0.26555, + 'significant': [ False, False, False]}, - u'000002': { u'p_value': 1, - u'significant': [ False, + '000002': { 'p_value': 1, + 'significant': [ False, False, False]}, - u'000003': { u'p_value': 1, - u'significant': [ False, + '000003': { 'p_value': 1, + 'significant': [ False, False, False]}}}], - u'significance_levels': [0.01, 0.05, 0.1]}, - u'updated': u'2015-07-28T18:17:11.829000', - u'white_box': False} + 'significance_levels': [0.01, 0.05, 0.1]}, + 'updated': '2015-07-28T18:17:11.829000', + 'white_box': False} Note that the output in the snippet above has been abbreviated. As you see, the ``statistical_tests`` attribute contains the ``fraud`, ``normality`` @@ -1231,10 +707,236 @@ and ``outliers`` sections where the information for each field's distribution is stored. You can check the statistical tests properties at the `API documentation -`_. +`_. + + +Supervised Models +----------------- + +Model +~~~~~ + +One of the greatest things about BigML is that the models that it +generates for you are fully white-boxed. To get the explicit tree-like +predictive model for the example above: + +.. code-block:: python + + >>> model = api.get_model(model) + >>> api.pprint(model['object']['model']['root']) + {'children': [ + {'children': [ + {'children': [{'count': 38, + 'distribution': [['Iris-virginica', 38]], + 'output': 'Iris-virginica', + 'predicate': {'field': '000002', + 'operator': '>', + 'value': 5.05}}, + 'children': [ + + [ ... 
] + + {'count': 50, + 'distribution': [['Iris-setosa', 50]], + 'output': 'Iris-setosa', + 'predicate': {'field': '000002', + 'operator': '<=', + 'value': 2.45}}]}, + {'count': 150, + 'distribution': [['Iris-virginica', 50], + ['Iris-versicolor', 50], + ['Iris-setosa', 50]], + 'output': 'Iris-virginica', + 'predicate': True}]}}} + +(Note that we have abbreviated the output in the snippet above for +readability: the full predictive model yo'll get is going to contain +much more details). + +Again, filtering options are also available using a query string expression, +for instance: + +.. code-block:: python + + >>> model = api.get_model(model, "limit=5") + +limits the number of fields that will be included in ``model`` to 5. + +You can check the model properties at the `API documentation +`_. + + +Linear Regressions +~~~~~~~~~~~~~~~~~~ + +A linear regression is a supervised machine learning method for +solving regression problems by computing the objective as a linear +combination of factors. The implementation is a multiple linear regression +that models the output as a linear combination of the predictors. +The coefficients are estimated doing a least-squares fit on the training data. + +As a linear combination can only be done using numeric values, non-numeric +fields need to be transformed to numeric ones following some rules: + +- Categorical fields will be encoded and each class appearance in input data + will convey a different contribution to the input vector. +- Text and items fields will be expanded to several numeric predictors, + each one indicating the number of occurences for a specific term. + Text fields without term analysis are excluded from the model. + +Therefore, the initial input data is transformed into an input vector with one +or may components per field. Also, if a field in the training data contains +missing data, the components corresponding to that field will include an +additional 1 or 0 value depending on whether the field is missing in the +input data or not. + +The JSON structure for a linear regression is: + +.. code-block:: python + + >>> api.pprint(linear_regression["object"]) + { 'category': 0, + 'code': 200, + 'columns': 4, + 'composites': None, + 'configuration': None, + 'configuration_status': False, + 'created': '2019-02-20T21:02:40.027000', + 'creator': 'merce', + 'credits': 0.0, + 'credits_per_prediction': 0.0, + 'dataset': 'dataset/5c6dc06a983efc18e2000084', + 'dataset_field_types': { 'categorical': 0, + 'datetime': 0, + 'items': 0, + 'numeric': 6, + 'preferred': 6, + 'text': 0, + 'total': 6}, + 'dataset_status': True, + 'datasets': [], + 'default_numeric_value': None, + 'description': '', + 'excluded_fields': [], + 'execution_id': None, + 'execution_status': None, + 'fields_maps': None, + 'fields_meta': { 'count': 4, + 'limit': 1000, + 'offset': 0, + 'query_total': 4, + 'total': 4}, + 'fusions': None, + 'input_fields': ['000000', '000001', '000002'], + 'linear_regression': { 'bias': True, + 'coefficients': [ [-1.88196], + [0.475633], + [0.122468], + [30.9141]], + 'fields': { '000000': { 'column_number': 0, + 'datatype': 'int8', + 'name': 'Prefix', + 'optype': 'numeric', + 'order': 0, + 'preferred': True, + 'summary': { 'counts': [ [ 4, + 1], + + ... 
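    # The 'stats' entry nested in 'linear_regression' (below) gathers the
    # per-coefficient diagnostics (confidence intervals, p-values, standard
    # errors and z-scores) together with global fit measures such as
    # 'r_squared', 'mean_squared_error' and 'number_of_samples'.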
+ 'stats': { 'confidence_intervals': [ [ 5.63628], + [ 0.375062], + [ 0.348577], + [ 44.4112]], + 'mean_squared_error': 342.206, + 'number_of_parameters': 4, + 'number_of_samples': 77, + 'p_values': [ [0.512831], + [0.0129362], + [0.491069], + [0.172471]], + 'r_squared': 0.136672, + 'standard_errors': [ [ 2.87571], + [ 0.191361], + [ 0.177849], + [ 22.6592]], + 'sum_squared_errors': 24981, + 'xtx': [ [ 4242, + 48396.9, + 51273.97, + 568], + [ 48396.9, + 570177.6584, + 594274.3274, + 6550.52], + [ 51273.97, + 594274.3274, + 635452.7068, + 6894.24], + [ 568, + 6550.52, + 6894.24, + 77]], + 'z_scores': [ [-0.654436], + [2.48552], + [0.688609], + [1.36431]]}}, + 'locale': 'en_US', + 'max_columns': 6, + 'max_rows': 80, + 'name': 'grades', + 'name_options': 'bias', + 'number_of_batchpredictions': 0, + 'number_of_evaluations': 0, + 'number_of_predictions': 2, + 'number_of_public_predictions': 0, + 'objective_field': '000005', + 'objective_field_name': 'Final', + 'objective_field_type': 'numeric', + 'objective_fields': ['000005'], + 'operating_point': { }, + 'optiml': None, + 'optiml_status': False, + 'ordering': 0, + 'out_of_bag': False, + 'out_of_bags': None, + 'price': 0.0, + 'private': True, + 'project': 'project/5c6dc062983efc18d5000129', + 'range': None, + 'ranges': None, + 'replacement': False, + 'replacements': None, + 'resource': 'linearregression/5c6dc070983efc18e00001f1', + 'rows': 80, + 'sample_rate': 1.0, + 'sample_rates': None, + 'seed': None, + 'seeds': None, + 'shared': False, + 'size': 2691, + 'source': 'source/5c6dc064983efc18e00001ed', + 'source_status': True, + 'status': { 'code': 5, + 'elapsed': 62086, + 'message': 'The linear regression has been created', + 'progress': 1}, + 'subscription': True, + 'tags': [], + 'type': 0, + 'updated': '2019-02-27T18:01:18.539000', + 'user_metadata': { }, + 'webhook': None, + 'weight_field': None, + 'white_box': False} + +Note that the output in the snippet above has been abbreviated. As you see, +the ``linear_regression`` attribute stores the coefficients used in the +linear function as well as the configuration parameters described in +the `developers section `_ . + Logistic Regressions --------------------- +~~~~~~~~~~~~~~~~~~~~ A logistic regression is a supervised machine learning method for solving classification problems. Each of the classes in the field @@ -1257,59 +959,59 @@ The JSON structure for a logistic regression is: .. 
code-block:: python >>> api.pprint(logistic_regression['object']) - { u'balance_objective': False, - u'category': 0, - u'code': 200, - u'columns': 5, - u'created': u'2015-10-09T16:11:08.444000', - u'credits': 0.017581939697265625, - u'credits_per_prediction': 0.0, - u'dataset': u'dataset/561304f537203f4c930001ca', - u'dataset_field_types': { u'categorical': 1, - u'datetime': 0, - u'effective_fields': 5, - u'numeric': 4, - u'preferred': 5, - u'text': 0, - u'total': 5}, - u'dataset_status': True, - u'description': u'', - u'excluded_fields': [], - u'fields_meta': { u'count': 5, - u'limit': 1000, - u'offset': 0, - u'query_total': 5, - u'total': 5}, - u'input_fields': [u'000000', u'000001', u'000002', u'000003'], - u'locale': u'en_US', - u'logistic_regression': { u'bias': 1, - u'c': 1, - u'coefficients': [ [ u'Iris-virginica', + { 'balance_objective': False, + 'category': 0, + 'code': 200, + 'columns': 5, + 'created': '2015-10-09T16:11:08.444000', + 'credits': 0.017581939697265625, + 'credits_per_prediction': 0.0, + 'dataset': 'dataset/561304f537203f4c930001ca', + 'dataset_field_types': { 'categorical': 1, + 'datetime': 0, + 'effective_fields': 5, + 'numeric': 4, + 'preferred': 5, + 'text': 0, + 'total': 5}, + 'dataset_status': True, + 'description': '', + 'excluded_fields': [], + 'fields_meta': { 'count': 5, + 'limit': 1000, + 'offset': 0, + 'query_total': 5, + 'total': 5}, + 'input_fields': ['000000', '000001', '000002', '000003'], + 'locale': 'en_US', + 'logistic_regression': { 'bias': 1, + 'c': 1, + 'coefficients': [ [ 'Iris-virginica', [ -1.7074433493289376, -1.533662474502423, 2.47026986670851, 2.5567582221085563, -1.2158200612711925]], - [ u'Iris-setosa', + [ 'Iris-setosa', [ 0.41021712519841674, 1.464162165246765, -2.26003266131107, -1.0210350909174153, 0.26421852991732514]], - [ u'Iris-versicolor', + [ 'Iris-versicolor', [ 0.42702327817072505, -1.611817241669904, 0.5763832839459982, -1.4069842681625884, 1.0946877732663143]]], - u'eps': 1e-05, - u'fields': { u'000000': { u'column_number': 0, - u'datatype': u'double', - u'name': u'sepal length', - u'optype': u'numeric', - u'order': 0, - u'preferred': True, - u'summary': { u'bins': [ [ 4.3, + 'eps': 1e-05, + 'fields': { '000000': { 'column_number': 0, + 'datatype': 'double', + 'name': 'sepal length', + 'optype': 'numeric', + 'order': 0, + 'preferred': True, + 'summary': { 'bins': [ [ 4.3, 1], [ 4.425, 4], @@ -1318,32 +1020,32 @@ The JSON structure for a logistic regression is: ... [ 7.9, 1]], - u'kurtosis': -0.57357, - u'maximum': 7.9, - u'mean': 5.84333, - u'median': 5.8, - u'minimum': 4.3, - u'missing_count': 0, - u'population': 150, - u'skewness': 0.31175, - u'splits': [ 4.51526, + 'kurtosis': -0.57357, + 'maximum': 7.9, + 'mean': 5.84333, + 'median': 5.8, + 'minimum': 4.3, + 'missing_count': 0, + 'population': 150, + 'skewness': 0.31175, + 'splits': [ 4.51526, 4.67252, 4.81113, ... 
6.92597, 7.20423, 7.64746], - u'standard_deviation': 0.82807, - u'sum': 876.5, - u'sum_squares': 5223.85, - u'variance': 0.68569}}, - u'000001': { u'column_number': 1, - u'datatype': u'double', - u'name': u'sepal width', - u'optype': u'numeric', - u'order': 1, - u'preferred': True, - u'summary': { u'counts': [ [ 2, + 'standard_deviation': 0.82807, + 'sum': 876.5, + 'sum_squares': 5223.85, + 'variance': 0.68569}}, + '000001': { 'column_number': 1, + 'datatype': 'double', + 'name': 'sepal width', + 'optype': 'numeric', + 'order': 1, + 'preferred': True, + 'summary': { 'counts': [ [ 2, 1], [ 2.2, 3], @@ -1352,25 +1054,25 @@ The JSON structure for a logistic regression is: 1], [ 4.4, 1]], - u'kurtosis': 0.18098, - u'maximum': 4.4, - u'mean': 3.05733, - u'median': 3, - u'minimum': 2, - u'missing_count': 0, - u'population': 150, - u'skewness': 0.31577, - u'standard_deviation': 0.43587, - u'sum': 458.6, - u'sum_squares': 1430.4, - u'variance': 0.18998}}, - u'000002': { u'column_number': 2, - u'datatype': u'double', - u'name': u'petal length', - u'optype': u'numeric', - u'order': 2, - u'preferred': True, - u'summary': { u'bins': [ [ 1, + 'kurtosis': 0.18098, + 'maximum': 4.4, + 'mean': 3.05733, + 'median': 3, + 'minimum': 2, + 'missing_count': 0, + 'population': 150, + 'skewness': 0.31577, + 'standard_deviation': 0.43587, + 'sum': 458.6, + 'sum_squares': 1430.4, + 'variance': 0.18998}}, + '000002': { 'column_number': 2, + 'datatype': 'double', + 'name': 'petal length', + 'optype': 'numeric', + 'order': 2, + 'preferred': True, + 'summary': { 'bins': [ [ 1, 1], [ 1.16667, 3], @@ -1381,31 +1083,31 @@ The JSON structure for a logistic regression is: 2], [ 6.9, 1]], - u'kurtosis': -1.39554, - u'maximum': 6.9, - u'mean': 3.758, - u'median': 4.35, - u'minimum': 1, - u'missing_count': 0, - u'population': 150, - u'skewness': -0.27213, - u'splits': [ 1.25138, + 'kurtosis': -1.39554, + 'maximum': 6.9, + 'mean': 3.758, + 'median': 4.35, + 'minimum': 1, + 'missing_count': 0, + 'population': 150, + 'skewness': -0.27213, + 'splits': [ 1.25138, 1.32426, 1.37171, ... 
6.02913, 6.38125], - u'standard_deviation': 1.7653, - u'sum': 563.7, - u'sum_squares': 2582.71, - u'variance': 3.11628}}, - u'000003': { u'column_number': 3, - u'datatype': u'double', - u'name': u'petal width', - u'optype': u'numeric', - u'order': 3, - u'preferred': True, - u'summary': { u'counts': [ [ 0.1, + 'standard_deviation': 1.7653, + 'sum': 563.7, + 'sum_squares': 2582.71, + 'variance': 3.11628}}, + '000003': { 'column_number': 3, + 'datatype': 'double', + 'name': 'petal width', + 'optype': 'numeric', + 'order': 3, + 'preferred': True, + 'summary': { 'counts': [ [ 0.1, 5], [ 0.2, 29], @@ -1414,1135 +1116,782 @@ The JSON structure for a logistic regression is: 3], [ 2.5, 3]], - u'kurtosis': -1.33607, - u'maximum': 2.5, - u'mean': 1.19933, - u'median': 1.3, - u'minimum': 0.1, - u'missing_count': 0, - u'population': 150, - u'skewness': -0.10193, - u'standard_deviation': 0.76224, - u'sum': 179.9, - u'sum_squares': 302.33, - u'variance': 0.58101}}, - u'000004': { u'column_number': 4, - u'datatype': u'string', - u'name': u'species', - u'optype': u'categorical', - u'order': 4, - u'preferred': True, - u'summary': { u'categories': [ [ u'Iris-setosa', + 'kurtosis': -1.33607, + 'maximum': 2.5, + 'mean': 1.19933, + 'median': 1.3, + 'minimum': 0.1, + 'missing_count': 0, + 'population': 150, + 'skewness': -0.10193, + 'standard_deviation': 0.76224, + 'sum': 179.9, + 'sum_squares': 302.33, + 'variance': 0.58101}}, + '000004': { 'column_number': 4, + 'datatype': 'string', + 'name': 'species', + 'optype': 'categorical', + 'order': 4, + 'preferred': True, + 'summary': { 'categories': [ [ 'Iris-setosa', 50], - [ u'Iris-versicolor', + [ 'Iris-versicolor', 50], - [ u'Iris-virginica', + [ 'Iris-virginica', 50]], - u'missing_count': 0}, - u'term_analysis': { u'enabled': True}}}, - u'normalize': False, - u'regularization': u'l2'}, - u'max_columns': 5, - u'max_rows': 150, - u'name': u"iris' dataset's logistic regression", - u'number_of_batchpredictions': 0, - u'number_of_evaluations': 0, - u'number_of_predictions': 1, - u'objective_field': u'000004', - u'objective_field_name': u'species', - u'objective_field_type': u'categorical', - u'objective_fields': [u'000004'], - u'out_of_bag': False, - u'private': True, - u'project': u'project/561304c137203f4c9300016c', - u'range': [1, 150], - u'replacement': False, - u'resource': u'logisticregression/5617e71c37203f506a000001', - u'rows': 150, - u'sample_rate': 1.0, - u'shared': False, - u'size': 4609, - u'source': u'source/561304f437203f4c930001c3', - u'source_status': True, - u'status': { u'code': 5, - u'elapsed': 86, - u'message': u'The logistic regression has been created', - u'progress': 1.0}, - u'subscription': False, - u'tags': [u'species'], - u'updated': u'2015-10-09T16:14:02.336000', - u'white_box': False} + 'missing_count': 0}, + 'term_analysis': { 'enabled': True}}}, + 'normalize': False, + 'regularization': 'l2'}, + 'max_columns': 5, + 'max_rows': 150, + 'name': u"iris' dataset's logistic regression", + 'number_of_batchpredictions': 0, + 'number_of_evaluations': 0, + 'number_of_predictions': 1, + 'objective_field': '000004', + 'objective_field_name': 'species', + 'objective_field_type': 'categorical', + 'objective_fields': ['000004'], + 'out_of_bag': False, + 'private': True, + 'project': 'project/561304c137203f4c9300016c', + 'range': [1, 150], + 'replacement': False, + 'resource': 'logisticregression/5617e71c37203f506a000001', + 'rows': 150, + 'sample_rate': 1.0, + 'shared': False, + 'size': 4609, + 'source': 'source/561304f437203f4c930001c3', + 
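    # In the 'logistic_regression' attribute above, 'coefficients' pairs each
    # objective class ('Iris-virginica', 'Iris-setosa', 'Iris-versicolor')
    # with the vector of coefficients applied to the input fields, while 'c',
    # 'eps', 'normalize' and 'regularization' echo the training settings.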
'source_status': True, + 'status': { 'code': 5, + 'elapsed': 86, + 'message': 'The logistic regression has been created', + 'progress': 1.0}, + 'subscription': False, + 'tags': ['species'], + 'updated': '2015-10-09T16:14:02.336000', + 'white_box': False} Note that the output in the snippet above has been abbreviated. As you see, the ``logistic_regression`` attribute stores the coefficients used in the logistic function as well as the configuration parameters described in the `developers section -`_ . +`_ . +Ensembles +~~~~~~~~~ -Linear Regressions ------------------- - -A linear regression is a supervised machine learning method for -solving regression problems by computing the objective as a linear -combination of factors. The implementation is a multiple linear regression -that models the output as a linear combination of the predictors. -The coefficients are estimated doing a least-squares fit on the training data. +Ensembles are superveised machine learning models that contain several decision +tree models. In BigML, we offer different flavors or ensembles: bagging, +boosted and random decision forests. -As a linear combination can only be done using numeric values, non-numeric -fields need to be transformed to numeric ones following some rules: +The structure of an ensemble can be obtained as follows: -- Categorical fields will be encoded and each class appearance in input data - will convey a different contribution to the input vector. -- Text and items fields will be expanded to several numeric predictors, - each one indicating the number of occurences for a specific term. - Text fields without term analysis are excluded from the model. +.. code-block:: python -Therefore, the initial input data is transformed into an input vector with one -or may components per field. Also, if a field in the training data contains -missing data, the components corresponding to that field will include an -additional 1 or 0 value depending on whether the field is missing in the -input data or not. + >>> ensemble = api.get_ensemble("ensemble/5d5aea06e476842219000add") + >>> api.pprint(ensemble["object"]) + { 'boosting': None, + 'category': 0, + 'code': 200, + 'columns': 5, + 'configuration': None, + 'configuration_status': False, + 'created': '2019-08-19T18:27:18.529000', + 'creator': 'mmartin', + 'dataset': 'dataset/5d5ae9f97811dd0195009c17', + 'dataset_field_types': { 'categorical': 1, + 'datetime': 0, + 'items': 0, + 'numeric': 4, + 'preferred': 5, + 'text': 0, + 'total': 5}, + 'dataset_status': False, + 'depth_threshold': 512, + 'description': '', + 'distributions': [ { 'importance': [ ['000002', 0.72548], + ['000003', 0.24971], + ['000001', 0.02481]], + 'predictions': { 'categories': [ [ 'Iris-setosa', + 52], + [ 'Iris-versicolor', + 49], + [ 'Iris-virginica', + 49]]}, + 'training': { 'categories': [ [ 'Iris-setosa', + 52], + [ 'Iris-versicolor', + 49], + [ 'Iris-virginica', + 49]]}}, + { 'importance': [ ['000002', 0.7129], + ['000003', 0.2635], + ['000000', 0.01485], + ['000001', 0.00875]], + 'predictions': { 'categories': [ [ 'Iris-setosa', + 52], + [ 'Iris-versicolor', + 46], + [ 'Iris-virginica', + 52]]}, + 'training': { 'categories': [ [ 'Iris-setosa', + 52], + [ 'Iris-versicolor', + 46], + [ 'Iris-virginica', + 52]]}}], + 'ensemble': { 'fields': { '000000': { 'column_number': 0, + 'datatype': 'double', + 'name': 'sepal length', + 'optype': 'numeric', + 'order': 0, + 'preferred': True, + 'summary': + ... 
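    # In the ensemble output above, 'distributions' reports, for each model
    # in the ensemble, its field importances and the category distributions
    # observed in training and in its predictions, while the 'ensemble' entry
    # embeds the field descriptions the models share.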
+ 'missing_count': 0}, + 'term_analysis': { 'enabled': True}}}}, + 'ensemble_sample': { 'rate': 1, + 'replacement': True, + 'seed': '820c4aa0a34a4fb69392476c6ffc38dc'}, + 'error_models': 0, + 'fields_meta': { 'count': 5, + 'limit': 1000, + 'offset': 0, + 'query_total': 5, + 'total': 5}, + 'finished_models': 2, + 'focus_field': None, + 'focus_field_name': None, + 'fusions': ['fusion/6488ab197411b45de19f1e19'], + 'importance': { '000000': 0.00743, + '000001': 0.01678, + '000002': 0.71919, + '000003': 0.2566}, + 'input_fields': ['000000', '000001', '000002', '000003'], + 'locale': 'en_US', + 'max_columns': 5, + 'max_rows': 150, + 'missing_splits': False, + 'models': [ 'model/5d5aea073514cd6bf200a630', + 'model/5d5aea083514cd6bf200a632'], + 'name': 'iris', + 'name_options': 'bootstrap decision forest, 512-node, 2-model, pruned, ' + 'deterministic order', + 'node_threshold': 512, + 'number_of_batchpredictions': 0, + 'number_of_evaluations': 0, + 'number_of_models': 2, + 'number_of_predictions': 0, + 'number_of_public_predictions': 0, + 'objective_field': '000004', + 'objective_field_details': { 'column_number': 4, + 'datatype': 'string', + 'name': 'species', + 'optype': 'categorical', + 'order': 4}, + 'objective_field_name': 'species', + 'objective_field_type': 'categorical', + 'objective_fields': ['000004'], + 'optiml': None, + 'optiml_status': False, + 'ordering': 0, + 'out_of_bag': False, + 'price': 0.0, + 'private': True, + 'project': None, + 'randomize': False, + 'range': None, + 'replacement': False, + 'resource': 'ensemble/5d5aea06e476842219000add', + 'rows': 150, + 'sample_rate': 1.0, + 'selective_pruning': True, + 'shared': True, + 'shared_clonable': True, + 'shared_hash': 'qfCR2ezORt5u8GNyGaTtJqwJemh', + 'sharing_key': '125380a1560a8efdc0e3eedee7bd2ccce1c4936c', + 'size': 4608, + 'source': 'source/5d5ae9f7e47684769e001337', + 'source_status': False, + 'split_candidates': 32, + 'split_field': None, + 'split_field_name': None, + 'stat_pruning': True, + 'status': { 'code': 5, + 'elapsed': 804, + 'message': 'The ensemble has been created', + 'progress': 1}, + 'subscription': False, + 'support_threshold': 0.0, + 'tags': [], + 'type': 0, + 'updated': '2023-06-13T17:44:57.780000', + 'white_box': False} -The JSON structure for a linear regression is: +Note that the output in the snippet above has been abbreviated. As you see, +the ``number_of_models`` attribute stores number of decision trees used in the +ensemble and the rest of the dictionary contains the configuration parameters described in the `developers section +`_ . -.. 
code-block:: python +Deepnets +~~~~~~~~ - >>> api.pprint(linear_regression["object"]) - { u'category': 0, - u'code': 200, - u'columns': 4, - u'composites': None, - u'configuration': None, - u'configuration_status': False, - u'created': u'2019-02-20T21:02:40.027000', - u'creator': u'merce', - u'credits': 0.0, - u'credits_per_prediction': 0.0, - u'dataset': u'dataset/5c6dc06a983efc18e2000084', - u'dataset_field_types': { u'categorical': 0, - u'datetime': 0, - u'items': 0, - u'numeric': 6, - u'preferred': 6, - u'text': 0, - u'total': 6}, - u'dataset_status': True, - u'datasets': [], - u'default_numeric_value': None, - u'description': u'', - u'excluded_fields': [], - u'execution_id': None, - u'execution_status': None, - u'fields_maps': None, - u'fields_meta': { u'count': 4, - u'limit': 1000, - u'offset': 0, - u'query_total': 4, - u'total': 4}, - u'fusions': None, - u'input_fields': [u'000000', u'000001', u'000002'], - u'linear_regression': { u'bias': True, - u'coefficients': [ [-1.88196], - [0.475633], - [0.122468], - [30.9141]], - u'fields': { u'000000': { u'column_number': 0, - u'datatype': u'int8', - u'name': u'Prefix', - u'optype': u'numeric', - u'order': 0, - u'preferred': True, - u'summary': { u'counts': [ [ 4, - 1], +Ensembles are superveised machine learning models that contain several decision +tree models. In BigML, we offer different flavors or ensembles: bagging, +boosted and random decision forests. - ... - u'stats': { u'confidence_intervals': [ [ 5.63628], - [ 0.375062], - [ 0.348577], - [ 44.4112]], - u'mean_squared_error': 342.206, - u'number_of_parameters': 4, - u'number_of_samples': 77, - u'p_values': [ [0.512831], - [0.0129362], - [0.491069], - [0.172471]], - u'r_squared': 0.136672, - u'standard_errors': [ [ 2.87571], - [ 0.191361], - [ 0.177849], - [ 22.6592]], - u'sum_squared_errors': 24981, - u'xtx': [ [ 4242, - 48396.9, - 51273.97, - 568], - [ 48396.9, - 570177.6584, - 594274.3274, - 6550.52], - [ 51273.97, - 594274.3274, - 635452.7068, - 6894.24], - [ 568, - 6550.52, - 6894.24, - 77]], - u'z_scores': [ [-0.654436], - [2.48552], - [0.688609], - [1.36431]]}}, - u'locale': u'en_US', - u'max_columns': 6, - u'max_rows': 80, - u'name': u'grades', - u'name_options': u'bias', - u'number_of_batchpredictions': 0, - u'number_of_evaluations': 0, - u'number_of_predictions': 2, - u'number_of_public_predictions': 0, - u'objective_field': u'000005', - u'objective_field_name': u'Final', - u'objective_field_type': u'numeric', - u'objective_fields': [u'000005'], - u'operating_point': { }, - u'optiml': None, - u'optiml_status': False, - u'ordering': 0, - u'out_of_bag': False, - u'out_of_bags': None, - u'price': 0.0, - u'private': True, - u'project': u'project/5c6dc062983efc18d5000129', - u'range': None, - u'ranges': None, - u'replacement': False, - u'replacements': None, - u'resource': u'linearregression/5c6dc070983efc18e00001f1', - u'rows': 80, - u'sample_rate': 1.0, - u'sample_rates': None, - u'seed': None, - u'seeds': None, - u'shared': False, - u'size': 2691, - u'source': u'source/5c6dc064983efc18e00001ed', - u'source_status': True, - u'status': { u'code': 5, - u'elapsed': 62086, - u'message': u'The linear regression has been created', - u'progress': 1}, - u'subscription': True, - u'tags': [], - u'type': 0, - u'updated': u'2019-02-27T18:01:18.539000', - u'user_metadata': { }, - u'webhook': None, - u'weight_field': None, - u'white_box': False} +The structure of an ensemble can be obtained as follows: -Note that the output in the snippet above has been abbreviated. 
As you see, -the ``linear_regression`` attribute stores the coefficients used in the -linear function as well as the configuration parameters described in -the `developers section `_ . +.. code-block:: python + >>> deepnet = api.get_deepnet("deepnet/64f2193379c602359ec90197") + >>> api.pprint(deepnet["object"]) + { 'category': 0, + 'code': 200, + 'columns': 11, + 'configuration': None, + 'configuration_status': False, + 'created': '2023-09-01T17:02:43.222000', + 'creator': 'mmartin', + 'dataset': 'dataset/64f2192251595a5d90394c1e', + 'dataset_field_types': { 'categorical': 1, + 'datetime': 1, + 'image': 0, + 'items': 0, + 'numeric': 9, + 'path': 0, + 'preferred': 10, + 'regions': 0, + 'text': 0, + 'total': 11}, + 'dataset_status': True, + 'deepnet': { 'batch_normalization': False, + 'deepnet_seed': 'bigml', + 'deepnet_version': 'alpha', + 'dropout_rate': 0.0, + 'fields': { '000000': { 'column_number': 0, + 'datatype': 'string', + 'name': 'cat-0', + 'optype': 'categorical', + 'order': 0, + 'preferred': True, + 'summary': { + ... + 1954.26254, + 'variance': 0.9737}}}, + 'hidden_layers': [ { 'activation_function': 'tanh', + 'number_of_nodes': 64, + 'offset': 'zeros', + 'seed': 0, + 'type': 'dense', + 'weights': 'glorot_uniform'}], + 'holdout_metrics': { 'mean_absolute_error': 0.8178046941757202, + 'mean_squared_error': 1.0125617980957031, + 'median_absolute_error': 0.6850314736366272, + 'r_squared': -0.009405492794412496, + 'spearman_r': 0.07955370033562714}, + 'learn_residuals': False, + 'learning_rate': 0.01, + 'max_iterations': 100, + 'missing_numerics': True, + 'network': { 'image_network': None, + 'layers': [ { 'activation_function': 'tanh', + 'mean': None, + 'number_of_nodes': 64, + 'offset': [ -0.01426, + 0.06489, + 0.00609, + ... + -0.06769, + 0.2289, + 0.03777]]}], + 'output_exposition': { 'mean': -0.06256, + 'stdev': 0.98676, + 'type': 'numeric'}, + 'preprocess': [ { 'index': 0, + 'type': 'categorical', + 'values': [ 'cat0', + 'cat1', + 'cat2']}, + { 'index': 1, + 'mean': 1974.3085, + 'stdev': 43.39534, + 'type': 'numeric'}, + { 'index': 2, + 'mean': 6.459, + 'stdev': 3.4764, + 'type': 'numeric'}, + { 'index': 3, + 'mean': 15.537, + 'stdev': 8.7924, + 'type': 'numeric'}, + { 'index': 4, + 'mean': 4.0015, + 'stdev': 2.02893, + 'type': 'numeric'}, + { 'index': 5, + 'mean': 11.8105, + 'stdev': 6.84646, + 'type': 'numeric'}, + { 'index': 6, + 'mean': 29.3555, + 'stdev': 17.3928, + 'type': 'numeric'}, + { 'index': 7, + 'mean': 29.715, + 'stdev': 17.14149, + 'type': 'numeric'}, + { 'index': 8, + 'mean': 501.6185, + 'stdev': 292.27451, + 'type': 'numeric'}], + 'trees': None}, + 'network_structure': { 'image_network': None, + 'layers': [ { 'activation_function': 'tanh', + 'mean': None, + 'number_of_nodes': 64, + 'offset': 'zeros', + 'residuals': False, + 'scale': None, + 'stdev': None, + 'weights': 'glorot_uniform'}, + { 'activation_function': 'linear', + 'mean': None, + 'number_of_nodes': 1, + 'offset': 'zeros', + 'residuals': False, + 'scale': None, + 'stdev': None, + 'weights': 'glorot_uniform'}], + 'output_exposition': { 'mean': -0.06256, + 'stdev': 0.98676, + 'type': 'numeric'}, + 'preprocess': [ { 'index': 0, + 'type': 'categorical', + 'values': [ 'cat0', + 'cat1', + 'cat2']}, + { 'index': 1, + 'mean': 1974.3085, + 'stdev': 43.39534, + 'type': 'numeric'}, + { 'index': 2, + 'mean': 6.459, + 'stdev': 3.4764, + 'type': 'numeric'}, + { 'index': 3, + 'mean': 15.537, + 'stdev': 8.7924, + 'type': 'numeric'}, + { 'index': 4, + 'mean': 4.0015, + 'stdev': 2.02893, + 'type': 'numeric'}, + { 
'index': 5, + 'mean': 11.8105, + 'stdev': 6.84646, + 'type': 'numeric'}, + { 'index': 6, + 'mean': 29.3555, + 'stdev': 17.3928, + 'type': 'numeric'}, + { 'index': 7, + 'mean': 29.715, + 'stdev': 17.14149, + 'type': 'numeric'}, + { 'index': 8, + 'mean': 501.6185, + 'stdev': 292.27451, + 'type': 'numeric'}], + 'trees': None}, + 'number_of_hidden_layers': 1, + 'number_of_iterations': 100, + 'optimizer': { 'adam': { 'beta1': 0.9, + 'beta2': 0.999, + 'epsilon': 1e-08}}, + 'search': False, + 'suggest_structure': False, + 'tree_embedding': False}, + 'description': '', + 'excluded_fields': [], + 'fields_meta': { 'count': 11, + 'limit': 1000, + 'offset': 0, + 'query_total': 11, + 'total': 11}, + 'importance': { '000000': 0.12331, + '000001-0': 0.25597, + '000001-1': 0.07716, + '000001-2': 0.15659, + '000001-3': 0.11564, + '000001-4': 0.0644, + '000001-5': 0.09814, + '000001-6': 0.0555, + '000001-7': 0.05329}, + 'input_fields': [ '000000', + '000001-0', + '000001-1', + '000001-2', + '000001-3', + '000001-4', + '000001-5', + '000001-6', + '000001-7'], + 'locale': 'en_US', + 'max_columns': 11, + 'max_rows': 2000, + 'name': 'dates2', + 'name_options': '1 hidden layers, adam, learning rate=0.01, 100-iteration, ' + 'beta1=0.9, beta2=0.999, epsilon=1e-08, missing values', + 'number_of_batchpredictions': 0, + 'number_of_evaluations': 0, + 'number_of_predictions': 0, + 'number_of_public_predictions': 0, + 'objective_field': '000002', + 'objective_field_name': 'target-2', + 'objective_field_type': 'numeric', + 'objective_fields': ['000002'], + 'optiml': None, + 'optiml_status': False, + 'ordering': 0, + 'out_of_bag': False, + 'price': 0.0, + 'private': True, + 'project': 'project/64f2191c4a1a2c29a1084943', + 'range': None, + 'regression_weight_ratio': None, + 'replacement': False, + 'resource': 'deepnet/64f2193379c602359ec90197', + 'rows': 2000, + 'sample_rate': 1.0, + 'shared': False, + 'size': 96976, + 'source': 'source/64f2191f51595a5d8cbf7883', + 'source_status': True, + 'status': { 'code': 5, + 'elapsed': 10013, + 'message': 'The deepnet has been created', + 'progress': 1.0}, + 'subscription': False, + 'tags': [], + 'type': 0, + 'updated': '2023-09-01T17:11:28.762000', + 'white_box': False} -Associations ------------- -Association Discovery is a popular method to find out relations among values -in high-dimensional datasets. +Note that the output in the snippet above has been abbreviated. As you see, +the ``network`` attribute stores the coefficients used in the +neural network structure and the rest of the dictionary shows the +configuration parameters described in the `developers section +`_ . -A common case where association discovery is often used is -market basket analysis. This analysis seeks for customer shopping -patterns across large transactional -datasets. For instance, do customers who buy hamburgers and ketchup also -consume bread? +OptiMLs +~~~~~~~ -Businesses use those insights to make decisions on promotions and product -placements. -Association Discovery can also be used for other purposes such as early -incident detection, web usage analysis, or software intrusion detection. +An OptiML is the result of an automated optimization process to find the +best model (type and configuration) to solve a particular +classification or regression problem. -In BigML, the Association resource object can be built from any dataset, and -its results are a list of association rules between the items in the dataset. 
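As a minimal sketch of creating one of these resources with the bindings
(the dataset ID is reused from the sample output below, and the ``metric``
and ``max_training_time`` arguments are illustrative assumptions that mirror
the attributes echoed in that output):

.. code-block:: python

    >>> from bigml.api import BigML
    >>> api = BigML()
    >>> # Illustrative only: any existing dataset ID can be used here.
    >>> args = {"metric": "max_phi", "max_training_time": 300}
    >>> optiml = api.create_optiml('dataset/5afdb7009252732d930009e8', args)
    >>> api.ok(optiml)
    True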
-In the example case, the corresponding -association rule would have hamburguers and ketchup as the items at the -left hand side of the association rule and bread would be the item at the -right hand side. Both sides in this association rule are related, -in the sense that observing -the items in the left hand side implies observing the items in the right hand -side. There are some metrics to ponder the quality of these association rules: +The selection process automates the usual time-consuming task of trying +different models and parameters and evaluating their results to find the +best one. Using the OptiML, non-experts can build top-performing models. -- Support: the proportion of instances which contain an itemset. +You can create an OptiML selecting the ojective field to be predicted, the +evaluation metric to be used to rank the models tested in the process and +a maximum time for the task to be run. -For an association rule, it means the number of instances in the dataset which -contain the rule's antecedent and rule's consequent together -over the total number of instances (N) in the dataset. +The JSON structure for an OptiML is: -It gives a measure of the importance of the rule. Association rules have -to satisfy a minimum support constraint (i.e., min_support). +.. code-block:: python -- Coverage: the support of the antedecent of an association rule. -It measures how often a rule can be applied. + >>> api.pprint(optiml["object"]) + { 'category': 0, + 'code': 200, + 'configuration': None, + 'configuration_status': False, + 'created': '2018-05-17T20:23:00.060000', + 'creator': 'mmartin', + 'dataset': 'dataset/5afdb7009252732d930009e8', + 'dataset_status': True, + 'datasets': [ 'dataset/5afde6488bf7d551ee00081c', + 'dataset/5afde6488bf7d551fd00511f', + 'dataset/5afde6488bf7d551fe002e0f', + ... + 'dataset/5afde64d8bf7d551fd00512e'], + 'description': '', + 'evaluations': [ 'evaluation/5afde65c8bf7d551fd00514c', + 'evaluation/5afde65c8bf7d551fd00514f', + ... + 'evaluation/5afde6628bf7d551fd005161'], + 'excluded_fields': [], + 'fields_meta': { 'count': 5, + 'limit': 1000, + 'offset': 0, + 'query_total': 5, + 'total': 5}, + 'input_fields': ['000000', '000001', '000002', '000003'], + 'model_count': { 'logisticregression': 1, 'model': 8, 'total': 9}, + 'models': [ 'model/5afde64e8bf7d551fd005131', + 'model/5afde64f8bf7d551fd005134', + 'model/5afde6518bf7d551fd005137', + 'model/5afde6538bf7d551fd00513a', + 'logisticregression/5afde6558bf7d551fd00513d', + ... + 'model/5afde65a8bf7d551fd005149'], + 'models_meta': { 'count': 9, 'limit': 1000, 'offset': 0, 'total': 9}, + 'name': 'iris', + 'name_options': '9 total models (logisticregression: 1, model: 8), metric=max_phi, model candidates=18, max. 
training time=300', + 'objective_field': '000004', + 'objective_field_details': { 'column_number': 4, + 'datatype': 'string', + 'name': 'species', + 'optype': 'categorical', + 'order': 4}, + 'objective_field_name': 'species', + 'objective_field_type': 'categorical', + 'objective_fields': ['000004'], + 'optiml': { 'created_resources': { 'dataset': 10, + 'logisticregression': 11, + 'logisticregression_evaluation': 11, + 'model': 29, + 'model_evaluation': 29}, + 'datasets': [ { 'id': 'dataset/5afde6488bf7d551ee00081c', + 'name': 'iris', + 'name_options': '120 instances, 5 fields (1 categorical, 4 numeric), sample rate=0.8'}, + { 'id': 'dataset/5afde6488bf7d551fd00511f', + 'name': 'iris', + 'name_options': '30 instances, 5 fields (1 categorical, 4 numeric), sample rate=0.2, out of bag'}, + { 'id': 'dataset/5afde6488bf7d551fe002e0f', + 'name': 'iris', + 'name_options': '120 instances, 5 fields (1 categorical, 4 numeric), sample rate=0.8'}, + ... + { 'id': 'dataset/5afde64d8bf7d551fd00512e', + 'name': 'iris', + 'name_options': '120 instances, 5 fields (1 categorical, 4 numeric), sample rate=0.8'}], + 'fields': { '000000': { 'column_number': 0, + 'datatype': 'double', + 'name': 'sepal length', + 'optype': 'numeric', + 'order': 0, + 'preferred': True, + 'summary': { 'bins': [ [ 4.3, + 1], + ... + [ 7.9, + 1]], + ... + 'sum': 179.9, + 'sum_squares': 302.33, + 'variance': 0.58101}}, + '000004': { 'column_number': 4, + 'datatype': 'string', + 'name': 'species', + 'optype': 'categorical', + 'order': 4, + 'preferred': True, + 'summary': { 'categories': [ [ 'Iris-setosa', + 50], + [ 'Iris-versicolor', + 50], + [ 'Iris-virginica', + 50]], + 'missing_count': 0}, + 'term_analysis': { 'enabled': True}}}, + 'max_training_time': 300, + 'metric': 'max_phi', + 'model_types': ['model', 'logisticregression'], + 'models': [ { 'evaluation': { 'id': 'evaluation/5afde65c8bf7d551fd00514c', + 'info': { 'accuracy': 0.96667, + 'average_area_under_pr_curve': 0.97867, + ... + 'per_class_statistics': [ { 'accuracy': 1, + 'area_under_pr_curve': 1, + ... + 'spearmans_rho': 0.82005}]}, + 'metric_value': 0.95356, + 'metric_variance': 0.00079, + 'name': 'iris vs. iris', + 'name_options': '279-node, deterministic order, operating kind=probability'}, + 'evaluation_count': 3, + 'id': 'model/5afde64e8bf7d551fd005131', + 'importance': [ [ '000002', + 0.70997], + [ '000003', + 0.27289], + [ '000000', + 0.0106], + [ '000001', + 0.00654]], + 'kind': 'model', + 'name': 'iris', + 'name_options': '279-node, deterministic order'}, + { 'evaluation': { 'id': 'evaluation/5afde65c8bf7d551fd00514f', + 'info': { 'accuracy': 0.93333, -- Confidence or (strength): The probability of seeing the rule's consequent -under the condition that the instances also contain the rule's antecedent. -Confidence is computed using the support of the association rule over the -coverage. That is, the percentage of instances which contain the consequent -and antecedent together over the number of instances which only contain -the antecedent. + ... + [ '000001', + 0.02133]], + 'kind': 'model', + 'name': 'iris', + 'name_options': '12-node, randomize, deterministic order, balanced'}], + 'number_of_model_candidates': 18, + 'recent_evaluations': [ 0.90764, + 0.94952, + ... 
+ 0.90427], + 'search_complete': True, + 'summary': { 'logisticregression': { 'best': 'logisticregression/5afde6558bf7d551fd00513d', + 'count': 1}, + 'model': { 'best': 'model/5afde64e8bf7d551fd005131', + 'count': 8}}}, + 'private': True, + 'project': None, + 'resource': 'optiml/5afde4a42a83475c1b0008a2', + 'shared': False, + 'size': 3686, + 'source': 'source/5afdb6fb9252732d930009e5', + 'source_status': True, + 'status': { 'code': 5, + 'elapsed': 448878.0, + 'message': 'The optiml has been created', + 'progress': 1}, + 'subscription': False, + 'tags': [], + 'test_dataset': None, + 'type': 0, + 'updated': '2018-05-17T20:30:29.063000'} -Confidence is directed and gives different values for the association -rules Antecedent → Consequent and Consequent → Antecedent. Association -rules also need to satisfy a minimum confidence constraint -(i.e., min_confidence). -- Leverage: the difference of the support of the association -rule (i.e., the antecedent and consequent appearing together) and what would -be expected if antecedent and consequent where statistically independent. -This is a value between -1 and 1. A positive value suggests a positive -relationship and a negative value suggests a negative relationship. -0 indicates independence. +You can check the optiml properties at the `API documentation +`_. -Lift: how many times more often antecedent and consequent occur together -than expected if they where statistically independent. -A value of 1 suggests that there is no relationship between the antecedent -and the consequent. Higher values suggest stronger positive relationships. -Lower values suggest stronger negative relationships (the presence of the -antecedent reduces the likelihood of the consequent) -As to the items used in association rules, each type of field is parsed to -extract items for the rules as follows: +Fusions +~~~~~~~ -- Categorical: each different value (class) will be considered a separate item. -- Text: each unique term will be considered a separate item. -- Items: each different item in the items summary will be considered. -- Numeric: Values will be converted into categorical by making a -segmentation of the values. -For example, a numeric field with values ranging from 0 to 600 split -into 3 segments: -segment 1 → [0, 200), segment 2 → [200, 400), segment 3 → [400, 600]. -You can refine the behavior of the transformation using -`discretization `_ -and `field_discretizations `_. +A Fusion is a special type of composed resource for which all +submodels satisfy the following constraints: they're all either +classifications or regressions over the same kind of data or +compatible fields, with the same objective field. Given those +properties, a fusion can be considered a supervised model, +and therefore one can predict with fusions and evaluate them. +Ensembles can be viewed as a kind of fusion subject to the additional +constraints that all its submodels are tree models that, moreover, +have been built from the same base input data, but sampled in particular ways. -The JSON structure for an association resource is: +The model types allowed to be a submodel of a fusion are: +deepnet, ensemble, fusion, model, logistic regression and linear regression. -.. code-block:: python +The JSON structure for an Fusion is: +.. 
code-block:: python - >>> api.pprint(association['object']) + >>> api.pprint(fusion["object"]) { - "associations":{ - "complement":false, - "discretization":{ - "pretty":true, - "size":5, - "trim":0, - "type":"width" - }, - "items":[ + "category": 0, + "code": 200, + "configuration": null, + "configuration_status": false, + "created": "2018-05-09T20:11:05.821000", + "credits_per_prediction": 0, + "description": "", + "fields_meta": { + "count": 5, + "limit": 1000, + "offset": 0, + "query_total": 5, + "total": 5 + }, + "fusion": { + "models": [ { - "complement":false, - "count":32, - "field_id":"000000", - "name":"Segment 1", - "bin_end":5, - "bin_start":null - }, - { - "complement":false, - "count":49, - "field_id":"000000", - "name":"Segment 3", - "bin_end":7, - "bin_start":6 - }, - { - "complement":false, - "count":12, - "field_id":"000000", - "name":"Segment 4", - "bin_end":null, - "bin_start":7 - }, - { - "complement":false, - "count":19, - "field_id":"000001", - "name":"Segment 1", - "bin_end":2.5, - "bin_start":null - }, - ... - { - "complement":false, - "count":50, - "field_id":"000004", - "name":"Iris-versicolor" - }, - { - "complement":false, - "count":50, - "field_id":"000004", - "name":"Iris-virginica" - } - ], - "max_k": 100, - "min_confidence":0, - "min_leverage":0, - "min_lift":1, - "min_support":0, - "rules":[ - { - "confidence":1, - "id":"000000", - "leverage":0.22222, - "lhs":[ - 13 - ], - "lhs_cover":[ - 0.33333, - 50 - ], - "lift":3, - "p_value":0.000000000, - "rhs":[ - 6 - ], - "rhs_cover":[ - 0.33333, - 50 - ], - "support":[ - 0.33333, - 50 - ] - }, - { - "confidence":1, - "id":"000001", - "leverage":0.22222, - "lhs":[ - 6 - ], - "lhs_cover":[ - 0.33333, - 50 - ], - "lift":3, - "p_value":0.000000000, - "rhs":[ - 13 - ], - "rhs_cover":[ - 0.33333, - 50 - ], - "support":[ - 0.33333, - 50 - ] - }, - ... - { - "confidence":0.26, - "id":"000029", - "leverage":0.05111, - "lhs":[ - 13 - ], - "lhs_cover":[ - 0.33333, - 50 - ], - "lift":2.4375, - "p_value":0.0000454342, - "rhs":[ - 5 - ], - "rhs_cover":[ - 0.10667, - 16 - ], - "support":[ - 0.08667, - 13 - ] + "id": "ensemble/5af272eb4e1727d378000050", + "kind": "ensemble", + "name": "Iris ensemble", + "name_options": "boosted trees, 1999-node, 16-iteration, deterministic order, balanced" }, { - "confidence":0.18, - "id":"00002a", - "leverage":0.04, - "lhs":[ - 15 - ], - "lhs_cover":[ - 0.33333, - 50 - ], - "lift":3, - "p_value":0.0000302052, - "rhs":[ - 9 - ], - "rhs_cover":[ - 0.06, - 9 - ], - "support":[ - 0.06, - 9 - ] + "id": "model/5af272fe4e1727d3780000d6", + "kind": "model", + "name": "Iris model", + "name_options": "1999-node, pruned, deterministic order, balanced" }, { - "confidence":1, - "id":"00002b", - "leverage":0.04, - "lhs":[ - 9 - ], - "lhs_cover":[ - 0.06, - 9 - ], - "lift":3, - "p_value":0.0000302052, - "rhs":[ - 15 - ], - "rhs_cover":[ - 0.33333, - 50 - ], - "support":[ - 0.06, - 9 - ] + "id": "logisticregression/5af272ff4e1727d3780000d9", + "kind": "logisticregression", + "name": "Iris LR", + "name_options": "L2 regularized (c=1), bias, auto-scaled, missing values, eps=0.001" } - ], - "rules_summary":{ - "confidence":{ - "counts":[ - [ - 0.18, - 1 - ], - [ - 0.24, - 1 - ], - [ - 0.26, - 2 - ], - ... 
- [ - 0.97959, - 1 - ], - [ - 1, - 9 - ] - ], - "maximum":1, - "mean":0.70986, - "median":0.72864, - "minimum":0.18, - "population":44, - "standard_deviation":0.24324, - "sum":31.23367, - "sum_squares":24.71548, - "variance":0.05916 - }, - "k":44, - "leverage":{ - "counts":[ - [ - 0.04, - 2 - ], - [ - 0.05111, - 4 - ], - [ - 0.05316, - 2 - ], - ... - [ - 0.22222, - 2 - ] - ], - "maximum":0.22222, - "mean":0.10603, - "median":0.10156, - "minimum":0.04, - "population":44, - "standard_deviation":0.0536, - "sum":4.6651, - "sum_squares":0.61815, - "variance":0.00287 - }, - "lhs_cover":{ - "counts":[ - [ - 0.06, - 2 - ], - [ - 0.08, - 2 - ], - [ - 0.10667, - 4 - ], - [ - 0.12667, - 1 - ], - ... - [ - 0.5, - 4 - ] - ], - "maximum":0.5, - "mean":0.29894, - "median":0.33213, - "minimum":0.06, - "population":44, - "standard_deviation":0.13386, - "sum":13.15331, - "sum_squares":4.70252, - "variance":0.01792 - }, - "lift":{ - "counts":[ - [ - 1.40625, - 2 - ], - [ - 1.5067, - 2 - ], - ... - [ - 2.63158, - 4 - ], - [ - 3, - 10 - ], - [ - 4.93421, - 2 - ], - [ - 12.5, - 2 - ] - ], - "maximum":12.5, - "mean":2.91963, - "median":2.58068, - "minimum":1.40625, - "population":44, - "standard_deviation":2.24641, - "sum":128.46352, - "sum_squares":592.05855, - "variance":5.04635 - }, - "p_value":{ - "counts":[ - [ - 0.000000000, - 2 - ], - [ - 0.000000000, - 4 - ], - [ - 0.000000000, - 2 - ], - ... - [ - 0.0000910873, - 2 - ] - ], - "maximum":0.0000910873, - "mean":0.0000106114, - "median":0.00000000, - "minimum":0.000000000, - "population":44, - "standard_deviation":0.0000227364, - "sum":0.000466903, - "sum_squares":0.0000000, - "variance":0.000000001 - }, - "rhs_cover":{ - "counts":[ - [ - 0.06, - 2 - ], - [ - 0.08, - 2 - ], - ... - [ - 0.42667, - 2 - ], - [ - 0.46667, - 3 - ], - [ - 0.5, - 4 - ] - ], - "maximum":0.5, - "mean":0.29894, - "median":0.33213, - "minimum":0.06, - "population":44, - "standard_deviation":0.13386, - "sum":13.15331, - "sum_squares":4.70252, - "variance":0.01792 - }, - "support":{ - "counts":[ - [ - 0.06, - 4 - ], - [ - 0.06667, - 2 - ], - [ - 0.08, - 2 - ], - [ - 0.08667, - 4 - ], - [ - 0.10667, - 4 - ], - [ - 0.15333, - 2 - ], - [ - 0.18667, - 4 - ], - [ - 0.19333, - 2 - ], - [ - 0.20667, - 2 - ], - [ - 0.27333, - 2 - ], - [ - 0.28667, - 2 - ], - [ - 0.3, - 4 - ], - [ - 0.32, - 2 - ], - [ - 0.33333, - 6 - ], - [ - 0.37333, - 2 - ] - ], - "maximum":0.37333, - "mean":0.20152, - "median":0.19057, - "minimum":0.06, - "population":44, - "standard_deviation":0.10734, - "sum":8.86668, - "sum_squares":2.28221, - "variance":0.01152 - } - }, - "search_strategy":"leverage", - "significance_level":0.05 + ] }, - "category":0, - "clones":0, - "code":200, - "columns":5, - "created":"2015-11-05T08:06:08.184000", - "credits":0.017581939697265625, - "dataset":"dataset/562fae3f4e1727141d00004e", - "dataset_status":true, - "dataset_type":0, - "description":"", - "excluded_fields":[ ], - "fields_meta":{ - "count":5, - "limit":1000, - "offset":0, - "query_total":5, - "total":5 + "importance": { + "000000": 0.05847, + "000001": 0.03028, + "000002": 0.13582, + "000003": 0.4421 }, - "input_fields":[ - "000000", - "000001", - "000002", - "000003", - "000004" + "model_count": { + "ensemble": 1, + "logisticregression": 1, + "model": 1, + "total": 3 + }, + "models": [ + "ensemble/5af272eb4e1727d378000050", + "model/5af272fe4e1727d3780000d6", + "logisticregression/5af272ff4e1727d3780000d9" ], - "locale":"en_US", - "max_columns":5, - "max_rows":150, - "name":"iris' dataset's association", - "out_of_bag":false, - 
"price":0, - "private":true, - "project":null, - "range":[ - 1, - 150 + "models_meta": { + "count": 3, + "limit": 1000, + "offset": 0, + "total": 3 + }, + "name": "iris", + "name_options": "3 total models (ensemble: 1, logisticregression: 1, model: 1)", + "number_of_batchpredictions": 0, + "number_of_evaluations": 0, + "number_of_predictions": 0, + "number_of_public_predictions": 0, + "objective_field": "000004", + "objective_field_details": { + "column_number": 4, + "datatype": "string", + "name": "species", + "optype": "categorical", + "order": 4 + }, + "objective_field_name": "species", + "objective_field_type": "categorical", + "objective_fields": [ + "000004" ], - "replacement":false, - "resource":"association/5621b70910cb86ae4c000000", - "rows":150, - "sample_rate":1, - "shared":false, - "size":4609, - "source":"source/562fae3a4e1727141d000048", - "source_status":true, - "status":{ - "code":5, - "elapsed":1072, - "message":"The association has been created", - "progress":1 + "private": true, + "project": null, + "resource":"fusion/59af8107b8aa0965d5b61138", + "shared": false, + "status": { + "code": 5, + "elapsed": 8420, + "message": "The fusion has been created", + "progress": 1 }, - "subscription":false, - "tags":[ ], - "updated":"2015-11-05T08:06:20.403000", - "white_box":false + "subscription": false, + "tags": [], + "type": 0, + "updated": "2018-05-09T20:11:14.258000" } -Note that the output in the snippet above has been abbreviated. As you see, -the ``associations`` attribute stores items, rules and metrics extracted -from the datasets as well as the configuration parameters described in -the `developers section `_ . - - -Topic Models ------------- - -A topic model is an unsupervised machine learning method -for unveiling all the different topics -underlying a collection of documents. -BigML uses Latent Dirichlet Allocation (LDA), one of the most popular -probabilistic methods for topic modeling. -In BigML, each instance (i.e. each row in your dataset) will -be considered a document and the contents of all the text fields -given as inputs will be automatically concatenated and considered the -document bag of words. - -Topic model is based on the assumption that any document -exhibits a mixture of topics. Each topic is composed of a set of words -which are thematically related. The words from a given topic have different -probabilities for that topic. At the same time, each word can be attributable -to one or several topics. So for example the word "sea" may be found in -a topic related with sea transport but also in a topic related to holidays. -Topic model automatically discards stop words and high -frequency words. - -Topic model's main applications include browsing, organizing and understanding -large archives of documents. It can been applied for information retrieval, -collaborative filtering, assessing document similarity among others. -The topics found in the dataset can also be very useful new features -before applying other models like classification, clustering, or -anomaly detection. - -The JSON structure for a topic model is: - -.. 
code-block:: python - - >>> api.pprint(topic['object']) - { u'category': 0, - u'code': 200, - u'columns': 1, - u'configuration': None, - u'configuration_status': False, - u'created': u'2016-11-23T23:47:54.703000', - u'credits': 0.0, - u'credits_per_prediction': 0.0, - u'dataset': u'dataset/58362aa0983efc45a0000005', - u'dataset_field_types': { u'categorical': 1, - u'datetime': 0, - u'effective_fields': 672, - u'items': 0, - u'numeric': 0, - u'preferred': 2, - u'text': 1, - u'total': 2}, - u'dataset_status': True, - u'dataset_type': 0, - u'description': u'', - u'excluded_fields': [], - u'fields_meta': { u'count': 1, - u'limit': 1000, - u'offset': 0, - u'query_total': 1, - u'total': 1}, - u'input_fields': [u'000001'], - u'locale': u'en_US', - u'max_columns': 2, - u'max_rows': 656, - u'name': u"spam dataset's Topic Model ", - u'number_of_batchtopicdistributions': 0, - u'number_of_public_topicdistributions': 0, - u'number_of_topicdistributions': 0, - u'ordering': 0, - u'out_of_bag': False, - u'price': 0.0, - u'private': True, - u'project': None, - u'range': [1, 656], - u'replacement': False, - u'resource': u'topicmodel/58362aaa983efc45a1000007', - u'rows': 656, - u'sample_rate': 1.0, - u'shared': False, - u'size': 54740, - u'source': u'source/58362a69983efc459f000001', - u'source_status': True, - u'status': { u'code': 5, - u'elapsed': 3222, - u'message': u'The topic model has been created', - u'progress': 1.0}, - u'subscription': True, - u'tags': [], - u'topic_model': { u'alpha': 4.166666666666667, - u'beta': 0.1, - u'bigrams': False, - u'case_sensitive': False, - u'fields': { u'000001': { u'column_number': 1, - u'datatype': u'string', - u'name': u'Message', - u'optype': u'text', - u'order': 0, - u'preferred': True, - u'summary': { u'average_length': 78.14787, - u'missing_count': 0, - u'tag_cloud': [ [ u'call', - 72], - [ u'ok', - 36], - [ u'gt', - 34], - ... - [ u'worse', - 2], - [ u'worth', - 2], - [ u'write', - 2], - [ u'yest', - 2], - [ u'yijue', - 2]], - u'term_forms': { }}, - u'term_analysis': { u'case_sensitive': False, - u'enabled': True, - u'language': u'en', - u'stem_words': False, - u'token_mode': u'all', - u'use_stopwords': False}}}, - u'hashed_seed': 62146850, - u'language': u'en', - u'number_of_topics': 12, - u'term_limit': 4096, - u'term_topic_assignments': [ [ 0, - 5, - 0, - 1, - 0, - 19, - 0, - 0, - 19, - 0, - 1, - 0], - [ 0, - 0, - 0, - 13, - 0, - 0, - 0, - 0, - 5, - 0, - 0, - 0], - ... - [ 0, - 7, - 27, - 0, - 112, - 0, - 0, - 0, - 0, - 0, - 14, - 2]], - u'termset': [ u'000', - u'03', - u'04', - u'06', - u'08000839402', - u'08712460324', - ... - - u'yes', - u'yest', - u'yesterday', - u'yijue', - u'yo', - u'yr', - u'yup', - u'\xfc'], - u'top_n_terms': 10, - u'topicmodel_seed': u'26c386d781963ca1ea5c90dab8a6b023b5e1d180', - u'topics': [ { u'id': u'000000', - u'name': u'Topic 00', - u'probability': 0.09375, - u'top_terms': [ [ u'im', - 0.04849], - [ u'hi', - 0.04717], - [ u'love', - 0.04585], - [ u'please', - 0.02867], - [ u'tomorrow', - 0.02867], - [ u'cos', - 0.02823], - [ u'sent', - 0.02647], - [ u'da', - 0.02383], - [ u'meet', - 0.02207], - [ u'dinner', - 0.01898]]}, - { u'id': u'000001', - u'name': u'Topic 01', - u'probability': 0.08215, - u'top_terms': [ [ u'lt', - 0.1015], - [ u'gt', - 0.1007], - [ u'wish', - 0.03958], - [ u'feel', - 0.0272], - [ u'shit', - 0.02361], - [ u'waiting', - 0.02281], - [ u'stuff', - 0.02001], - [ u'name', - 0.01921], - [ u'comp', - 0.01522], - [ u'forgot', - 0.01482]]}, - ... 
- { u'id': u'00000b', - u'name': u'Topic 11', - u'probability': 0.0826, - u'top_terms': [ [ u'call', - 0.15084], - [ u'min', - 0.05003], - [ u'msg', - 0.03185], - [ u'home', - 0.02648], - [ u'mind', - 0.02152], - [ u'lt', - 0.01987], - [ u'bring', - 0.01946], - [ u'camera', - 0.01905], - [ u'set', - 0.01905], - [ u'contact', - 0.01781]]}], - u'use_stopwords': False}, - u'updated': u'2016-11-23T23:48:03.336000', - u'white_box': False} - -Note that the output in the snippet above has been abbreviated. - - -The topic model returns a list of top terms for each topic found in the data. -Note that topics are not labeled, so you have to infer their meaning according -to the words they are composed of. - -Once you build the topic model you can calculate each topic probability -for a given document by using Topic Distribution. -This information can be useful to find documents similarities based -on their thematic. -As you see, -the ``topic_model`` attribute stores the topics and termset and term to -topic assignment, -as well as the configuration parameters described in -the `developers section `_ . +You can check the fusion properties at the `API documentation +`_. Time Series ------------ +~~~~~~~~~~~ A time series model is a supervised learning method to forecast the future values of a field based on its previously observed values. @@ -2568,449 +1917,1489 @@ The JSON structure for a time series is: .. code-block:: python >>> api.pprint(time_series['object']) - { u'category': 0, - u'clones': 0, - u'code': 200, - u'columns': 1, - u'configuration': None, - u'configuration_status': False, - u'created': u'2017-07-15T12:49:42.601000', - u'credits': 0.0, - u'dataset': u'dataset/5968ec42983efc21b0000016', - u'dataset_field_types': { u'categorical': 0, - u'datetime': 0, - u'effective_fields': 6, - u'items': 0, - u'numeric': 6, - u'preferred': 6, - u'text': 0, - u'total': 6}, - u'dataset_status': True, - u'dataset_type': 0, - u'description': u'', - u'fields_meta': { u'count': 1, - u'limit': 1000, - u'offset': 0, - u'query_total': 1, - u'total': 1}, - u'forecast': { u'000005': [ { u'lower_bound': [ 30.14111, + { 'category': 0, + 'clones': 0, + 'code': 200, + 'columns': 1, + 'configuration': None, + 'configuration_status': False, + 'created': '2017-07-15T12:49:42.601000', + 'credits': 0.0, + 'dataset': 'dataset/5968ec42983efc21b0000016', + 'dataset_field_types': { 'categorical': 0, + 'datetime': 0, + 'effective_fields': 6, + 'items': 0, + 'numeric': 6, + 'preferred': 6, + 'text': 0, + 'total': 6}, + 'dataset_status': True, + 'dataset_type': 0, + 'description': '', + 'fields_meta': { 'count': 1, + 'limit': 1000, + 'offset': 0, + 'query_total': 1, + 'total': 1}, + 'forecast': { '000005': [ { 'lower_bound': [ 30.14111, 30.14111, ... 30.14111], - u'model': u'A,N,N', - u'point_forecast': [ 68.53181, + 'model': 'A,N,N', + 'point_forecast': [ 68.53181, 68.53181, ... 68.53181, 68.53181], - u'time_range': { u'end': 129, - u'interval': 1, - u'interval_unit': u'milliseconds', - u'start': 80}, - u'upper_bound': [ 106.92251, + 'time_range': { 'end': 129, + 'interval': 1, + 'interval_unit': 'milliseconds', + 'start': 80}, + 'upper_bound': [ 106.92251, 106.92251, ... 106.92251, 106.92251]}, - { u'lower_bound': [ 35.44118, + { 'lower_bound': [ 35.44118, 35.5032, ... 35.28083], - u'model': u'A,Ad,N', + 'model': 'A,Ad,N', ... 
66.83537, 66.9465], - u'time_range': { u'end': 129, - u'interval': 1, - u'interval_unit': u'milliseconds', - u'start': 80}}]}, - u'horizon': 50, - u'locale': u'en_US', - u'max_columns': 6, - u'max_rows': 80, - u'name': u'my_ts_data', - u'name_options': u'period=1, range=[1, 80]', - u'number_of_evaluations': 0, - u'number_of_forecasts': 0, - u'number_of_public_forecasts': 0, - u'objective_field': u'000005', - u'objective_field_name': u'Final', - u'objective_field_type': u'numeric', - u'objective_fields': [u'000005'], - u'objective_fields_names': [u'Final'], - u'price': 0.0, - u'private': True, - u'project': None, - u'range': [1, 80], - u'resource': u'timeseries/596a0f66983efc53f3000000', - u'rows': 80, - u'shared': False, - u'short_url': u'', - u'size': 2691, - u'source': u'source/5968ec3c983efc218c000006', - u'source_status': True, - u'status': { u'code': 5, - u'elapsed': 8358, - u'message': u'The time series has been created', - u'progress': 1.0}, - u'subscription': True, - u'tags': [], - u'time_series': { u'all_numeric_objectives': False, - u'datasets': { u'000005': u'dataset/596a0f70983efc53f3000003'}, - u'ets_models': { u'000005': [ { u'aic': 831.30903, - u'aicc': 831.84236, - u'alpha': 0.00012, - u'beta': 0, - u'bic': 840.83713, - u'final_state': { u'b': 0, - u'l': 68.53181, - u's': [ 0]}, - u'gamma': 0, - u'initial_state': { u'b': 0, - u'l': 68.53217, - u's': [ 0]}, - u'name': u'A,N,N', - u'period': 1, - u'phi': 1, - u'r_squared': -0.0187, - u'sigma': 19.19535}, - { u'aic': 834.43049, + 'time_range': { 'end': 129, + 'interval': 1, + 'interval_unit': 'milliseconds', + 'start': 80}}]}, + 'horizon': 50, + 'locale': 'en_US', + 'max_columns': 6, + 'max_rows': 80, + 'name': 'my_ts_data', + 'name_options': 'period=1, range=[1, 80]', + 'number_of_evaluations': 0, + 'number_of_forecasts': 0, + 'number_of_public_forecasts': 0, + 'objective_field': '000005', + 'objective_field_name': 'Final', + 'objective_field_type': 'numeric', + 'objective_fields': ['000005'], + 'objective_fields_names': ['Final'], + 'price': 0.0, + 'private': True, + 'project': None, + 'range': [1, 80], + 'resource': 'timeseries/596a0f66983efc53f3000000', + 'rows': 80, + 'shared': False, + 'short_url': '', + 'size': 2691, + 'source': 'source/5968ec3c983efc218c000006', + 'source_status': True, + 'status': { 'code': 5, + 'elapsed': 8358, + 'message': 'The time series has been created', + 'progress': 1.0}, + 'subscription': True, + 'tags': [], + 'time_series': { 'all_numeric_objectives': False, + 'datasets': { '000005': 'dataset/596a0f70983efc53f3000003'}, + 'ets_models': { '000005': [ { 'aic': 831.30903, + 'aicc': 831.84236, + 'alpha': 0.00012, + 'beta': 0, + 'bic': 840.83713, + 'final_state': { 'b': 0, + 'l': 68.53181, + 's': [ 0]}, + 'gamma': 0, + 'initial_state': { 'b': 0, + 'l': 68.53217, + 's': [ 0]}, + 'name': 'A,N,N', + 'period': 1, + 'phi': 1, + 'r_squared': -0.0187, + 'sigma': 19.19535}, + { 'aic': 834.43049, ... - u'slope': 0.11113, - u'value': 61.39}]}, - u'fields': { u'000005': { u'column_number': 5, - u'datatype': u'double', - u'name': u'Final', - u'optype': u'numeric', - u'order': 0, - u'preferred': True, - u'summary': { u'bins': [ [ 28.06, + 'slope': 0.11113, + 'value': 61.39}]}, + 'fields': { '000005': { 'column_number': 5, + 'datatype': 'double', + 'name': 'Final', + 'optype': 'numeric', + 'order': 0, + 'preferred': True, + 'summary': { 'bins': [ [ 28.06, 1], [ 34.44, ... [ 108.335, 2]], ... 
- u'sum_squares': 389814.3944, - u'variance': 380.73315}}}, - u'period': 1, - u'time_range': { u'end': 79, - u'interval': 1, - u'interval_unit': u'milliseconds', - u'start': 0}}, - u'type': 0, - u'updated': u'2017-07-15T12:49:52.549000', - u'white_box': False} + 'sum_squares': 389814.3944, + 'variance': 380.73315}}}, + 'period': 1, + 'time_range': { 'end': 79, + 'interval': 1, + 'interval_unit': 'milliseconds', + 'start': 0}}, + 'type': 0, + 'updated': '2017-07-15T12:49:52.549000', + 'white_box': False} You can check the time series properties at the `API documentation -`_. - - - -OptiMLs -------- +`_. -An OptiML is the result of an automated optimization process to find the -best model (type and configuration) to solve a particular -classification or regression problem. -The selection process automates the usual time-consuming task of trying -different models and parameters and evaluating their results to find the -best one. Using the OptiML, non-experts can build top-performing models. +Unsupervised Models +------------------- -You can create an OptiML selecting the ojective field to be predicted, the -evaluation metric to be used to rank the models tested in the process and -a maximum time for the task to be run. +Cluster +~~~~~~~ -The JSON structure for an OptiML is: +For unsupervised learning problems, the cluster is used to classify in a +limited number of groups your training data. The cluster structure is defined +by the centers of each group of data, named centroids, and the data enclosed +in the group. As for in the model's case, the cluster is a white-box resource +and can be retrieved as a JSON: .. code-block:: python - >>> api.pprint(optiml["object"]) - { u'category': 0, - u'code': 200, - u'configuration': None, - u'configuration_status': False, - u'created': u'2018-05-17T20:23:00.060000', - u'creator': u'mmartin', - u'dataset': u'dataset/5afdb7009252732d930009e8', - u'dataset_status': True, - u'datasets': [ u'dataset/5afde6488bf7d551ee00081c', - u'dataset/5afde6488bf7d551fd00511f', - u'dataset/5afde6488bf7d551fe002e0f', - ... - u'dataset/5afde64d8bf7d551fd00512e'], - u'description': u'', - u'evaluations': [ u'evaluation/5afde65c8bf7d551fd00514c', - u'evaluation/5afde65c8bf7d551fd00514f', - ... - u'evaluation/5afde6628bf7d551fd005161'], - u'excluded_fields': [], - u'fields_meta': { u'count': 5, - u'limit': 1000, - u'offset': 0, - u'query_total': 5, - u'total': 5}, - u'input_fields': [u'000000', u'000001', u'000002', u'000003'], - u'model_count': { u'logisticregression': 1, u'model': 8, u'total': 9}, - u'models': [ u'model/5afde64e8bf7d551fd005131', - u'model/5afde64f8bf7d551fd005134', - u'model/5afde6518bf7d551fd005137', - u'model/5afde6538bf7d551fd00513a', - u'logisticregression/5afde6558bf7d551fd00513d', - ... - u'model/5afde65a8bf7d551fd005149'], - u'models_meta': { u'count': 9, u'limit': 1000, u'offset': 0, u'total': 9}, - u'name': u'iris', - u'name_options': u'9 total models (logisticregression: 1, model: 8), metric=max_phi, model candidates=18, max. 
training time=300', - u'objective_field': u'000004', - u'objective_field_details': { u'column_number': 4, - u'datatype': u'string', - u'name': u'species', - u'optype': u'categorical', - u'order': 4}, - u'objective_field_name': u'species', - u'objective_field_type': u'categorical', - u'objective_fields': [u'000004'], - u'optiml': { u'created_resources': { u'dataset': 10, - u'logisticregression': 11, - u'logisticregression_evaluation': 11, - u'model': 29, - u'model_evaluation': 29}, - u'datasets': [ { u'id': u'dataset/5afde6488bf7d551ee00081c', - u'name': u'iris', - u'name_options': u'120 instances, 5 fields (1 categorical, 4 numeric), sample rate=0.8'}, - { u'id': u'dataset/5afde6488bf7d551fd00511f', - u'name': u'iris', - u'name_options': u'30 instances, 5 fields (1 categorical, 4 numeric), sample rate=0.2, out of bag'}, - { u'id': u'dataset/5afde6488bf7d551fe002e0f', - u'name': u'iris', - u'name_options': u'120 instances, 5 fields (1 categorical, 4 numeric), sample rate=0.8'}, - ... - { u'id': u'dataset/5afde64d8bf7d551fd00512e', - u'name': u'iris', - u'name_options': u'120 instances, 5 fields (1 categorical, 4 numeric), sample rate=0.8'}], - u'fields': { u'000000': { u'column_number': 0, - u'datatype': u'double', - u'name': u'sepal length', - u'optype': u'numeric', - u'order': 0, - u'preferred': True, - u'summary': { u'bins': [ [ 4.3, - 1], - ... - [ 7.9, - 1]], - ... - u'sum': 179.9, - u'sum_squares': 302.33, - u'variance': 0.58101}}, - u'000004': { u'column_number': 4, - u'datatype': u'string', - u'name': u'species', - u'optype': u'categorical', - u'order': 4, - u'preferred': True, - u'summary': { u'categories': [ [ u'Iris-setosa', - 50], - [ u'Iris-versicolor', - 50], - [ u'Iris-virginica', - 50]], - u'missing_count': 0}, - u'term_analysis': { u'enabled': True}}}, - u'max_training_time': 300, - u'metric': u'max_phi', - u'model_types': [u'model', u'logisticregression'], - u'models': [ { u'evaluation': { u'id': u'evaluation/5afde65c8bf7d551fd00514c', - u'info': { u'accuracy': 0.96667, - u'average_area_under_pr_curve': 0.97867, - ... - u'per_class_statistics': [ { u'accuracy': 1, - u'area_under_pr_curve': 1, - ... - u'spearmans_rho': 0.82005}]}, - u'metric_value': 0.95356, - u'metric_variance': 0.00079, - u'name': u'iris vs. iris', - u'name_options': u'279-node, deterministic order, operating kind=probability'}, - u'evaluation_count': 3, - u'id': u'model/5afde64e8bf7d551fd005131', - u'importance': [ [ u'000002', - 0.70997], - [ u'000003', - 0.27289], - [ u'000000', - 0.0106], - [ u'000001', - 0.00654]], - u'kind': u'model', - u'name': u'iris', - u'name_options': u'279-node, deterministic order'}, - { u'evaluation': { u'id': u'evaluation/5afde65c8bf7d551fd00514f', - u'info': { u'accuracy': 0.93333, - - ... - [ u'000001', - 0.02133]], - u'kind': u'model', - u'name': u'iris', - u'name_options': u'12-node, randomize, deterministic order, balanced'}], - u'number_of_model_candidates': 18, - u'recent_evaluations': [ 0.90764, - 0.94952, - ... 
- 0.90427], - u'search_complete': True, - u'summary': { u'logisticregression': { u'best': u'logisticregression/5afde6558bf7d551fd00513d', - u'count': 1}, - u'model': { u'best': u'model/5afde64e8bf7d551fd005131', - u'count': 8}}}, - u'private': True, - u'project': None, - u'resource': u'optiml/5afde4a42a83475c1b0008a2', - u'shared': False, - u'size': 3686, - u'source': u'source/5afdb6fb9252732d930009e5', - u'source_status': True, - u'status': { u'code': 5, - u'elapsed': 448878.0, - u'message': u'The optiml has been created', - u'progress': 1}, - u'subscription': False, - u'tags': [], - u'test_dataset': None, - u'type': 0, - u'updated': u'2018-05-17T20:30:29.063000'} - + >>> cluster = api.get_cluster(cluster) + >>> api.pprint(cluster['object']) + { 'balance_fields': True, + 'category': 0, + 'cluster_datasets': { '000000': '', '000001': '', '000002': ''}, + 'cluster_datasets_ids': { '000000': '53739b9ae4b0dad82b0a65e6', + '000001': '53739b9ae4b0dad82b0a65e7', + '000002': '53739b9ae4b0dad82b0a65e8'}, + 'cluster_seed': '2c249dda00fbf54ab4cdd850532a584f286af5b6', + 'clusters': { 'clusters': [ { 'center': { '000000': 58.5, + '000001': 26.8314, + '000002': 44.27907, + '000003': 14.37209}, + 'count': 56, + 'distance': { 'bins': [ [ 0.69602, + 2], + [ ... ] + [ 3.77052, + 1]], + 'maximum': 3.77052, + 'mean': 1.61711, + 'median': 1.52146, + 'minimum': 0.69237, + 'population': 56, + 'standard_deviation': 0.6161, + 'sum': 90.55805, + 'sum_squares': 167.31926, + 'variance': 0.37958}, + 'id': '000000', + 'name': 'Cluster 0'}, + { 'center': { '000000': 50.06, + '000001': 34.28, + '000002': 14.62, + '000003': 2.46}, + 'count': 50, + 'distance': { 'bins': [ [ 0.16917, + 1], + [ ... ] + [ 4.94699, + 1]], + 'maximum': 4.94699, + 'mean': 1.50725, + 'median': 1.3393, + 'minimum': 0.16917, + 'population': 50, + 'standard_deviation': 1.00994, + 'sum': 75.36252, + 'sum_squares': 163.56918, + 'variance': 1.01998}, + 'id': '000001', + 'name': 'Cluster 1'}, + { 'center': { '000000': 68.15625, + '000001': 31.25781, + '000002': 55.48438, + '000003': 19.96875}, + 'count': 44, + 'distance': { 'bins': [ [ 0.36825, + 1], + [ ... ] + [ 3.87216, + 1]], + 'maximum': 3.87216, + 'mean': 1.67264, + 'median': 1.63705, + 'minimum': 0.36825, + 'population': 44, + 'standard_deviation': 0.78905, + 'sum': 73.59627, + 'sum_squares': 149.87194, + 'variance': 0.6226}, + 'id': '000002', + 'name': 'Cluster 2'}], + 'fields': { '000000': { 'column_number': 0, + 'datatype': 'int8', + 'name': 'sepal length', + 'optype': 'numeric', + 'order': 0, + 'preferred': True, + 'summary': { 'bins': [ [ 43.75, + 4], + [ ... ] + [ 79, + 1]], + 'maximum': 79, + 'mean': 58.43333, + 'median': 57.7889, + 'minimum': 43, + 'missing_count': 0, + 'population': 150, + 'splits': [ 45.15258, + 46.72525, + 72.04226, + 76.47461], + 'standard_deviation': 8.28066, + 'sum': 8765, + 'sum_squares': 522385, + 'variance': 68.56935}}, + [ ... 
] + [ 25, + 3]], + 'maximum': 25, + 'mean': 11.99333, + 'median': 13.28483, + 'minimum': 1, + 'missing_count': 0, + 'population': 150, + 'standard_deviation': 7.62238, + 'sum': 1799, + 'sum_squares': 30233, + 'variance': 58.10063}}}}, + 'code': 202, + 'columns': 4, + 'created': '2014-05-14T16:36:40.993000', + 'credits': 0.017578125, + 'credits_per_prediction': 0.0, + 'dataset': 'dataset/53739b88c8db63122b000411', + 'dataset_field_types': { 'categorical': 1, + 'datetime': 0, + 'numeric': 4, + 'preferred': 5, + 'text': 0, + 'total': 5}, + 'dataset_status': True, + 'dataset_type': 0, + 'description': '', + 'excluded_fields': ['000004'], + 'field_scales': None, + 'fields_meta': { 'count': 4, + 'limit': 1000, + 'offset': 0, + 'query_total': 4, + 'total': 4}, + 'input_fields': ['000000', '000001', '000002', '000003'], + 'k': 3, + 'locale': 'es-ES', + 'max_columns': 5, + 'max_rows': 150, + 'name': 'my iris', + 'number_of_batchcentroids': 0, + 'number_of_centroids': 0, + 'number_of_public_centroids': 0, + 'out_of_bag': False, + 'price': 0.0, + 'private': True, + 'range': [1, 150], + 'replacement': False, + 'resource': 'cluster/53739b98d994972da7001de9', + 'rows': 150, + 'sample_rate': 1.0, + 'scales': { '000000': 0.22445382597655375, + '000001': 0.4264213814821549, + '000002': 0.10528680248949522, + '000003': 0.2438379900517961}, + 'shared': False, + 'size': 4608, + 'source': 'source/53739b24d994972da7001ddd', + 'source_status': True, + 'status': { 'code': 5, + 'elapsed': 1009, + 'message': 'The cluster has been created', + 'progress': 1.0}, + 'subscription': True, + 'tags': [], + 'updated': '2014-05-14T16:40:26.234728', + 'white_box': False} + +(Note that we have abbreviated the output in the snippet above for +readability: the full predictive cluster yo'll get is going to contain +much more details). + +You can check the cluster properties at the `API documentation +`_. + +Anomaly detector +~~~~~~~~~~~~~~~~ + +For anomaly detection problems, BigML anomaly detector uses iforest as an +unsupervised kind of model that detects anomalous data in a dataset. The +information it returns encloses a `top_anomalies` block +that contains a list of the most anomalous +points. For each, we capture a `score` from 0 to 1. The closer to 1, +the more anomalous. We also capture the `row` which gives values for +each field in the order defined by `input_fields`. Similarly we give +a list of `importances` which match the `row` values. These +importances tell us which values contributed most to the anomaly +score. Thus, the structure of an anomaly detector is similar to: + +.. 
code-block:: python + + { 'category': 0, + 'code': 200, + 'columns': 14, + 'constraints': False, + 'created': '2014-09-08T18:51:11.893000', + 'credits': 0.11653518676757812, + 'credits_per_prediction': 0.0, + 'dataset': 'dataset/540dfa9d9841fa5c88000765', + 'dataset_field_types': { 'categorical': 21, + 'datetime': 0, + 'numeric': 21, + 'preferred': 14, + 'text': 0, + 'total': 42}, + 'dataset_status': True, + 'dataset_type': 0, + 'description': '', + 'excluded_fields': [], + 'fields_meta': { 'count': 14, + 'limit': 1000, + 'offset': 0, + 'query_total': 14, + 'total': 14}, + 'forest_size': 128, + 'input_fields': [ '000004', + '000005', + '000009', + '000016', + '000017', + '000018', + '000019', + '00001e', + '00001f', + '000020', + '000023', + '000024', + '000025', + '000026'], + 'locale': 'en_US', + 'max_columns': 42, + 'max_rows': 200, + 'model': { 'fields': { '000004': { 'column_number': 4, + 'datatype': 'int16', + 'name': 'src_bytes', + 'optype': 'numeric', + 'order': 0, + 'preferred': True, + 'summary': { 'bins': [ [ 143, + 2], + ... + [ 370, + 2]], + 'maximum': 370, + 'mean': 248.235, + 'median': 234.57157, + 'minimum': 141, + 'missing_count': 0, + 'population': 200, + 'splits': [ 159.92462, + 173.73312, + 188, + ... + 339.55228], + 'standard_deviation': 49.39869, + 'sum': 49647, + 'sum_squares': 12809729, + 'variance': 2440.23093}}, + '000005': { 'column_number': 5, + 'datatype': 'int32', + 'name': 'dst_bytes', + 'optype': 'numeric', + 'order': 1, + 'preferred': True, + ... + 'sum': 1030851, + 'sum_squares': 22764504759, + 'variance': 87694652.45224}}, + '000009': { 'column_number': 9, + 'datatype': 'string', + 'name': 'hot', + 'optype': 'categorical', + 'order': 2, + 'preferred': True, + 'summary': { 'categories': [ [ '0', + 199], + [ '1', + 1]], + 'missing_count': 0}, + 'term_analysis': { 'enabled': True}}, + '000016': { 'column_number': 22, + 'datatype': 'int8', + 'name': 'count', + 'optype': 'numeric', + 'order': 3, + 'preferred': True, + ... + 'population': 200, + 'standard_deviation': 5.42421, + 'sum': 1351, + 'sum_squares': 14981, + 'variance': 29.42209}}, + '000017': { ... }}}, + 'kind': 'iforest', + 'mean_depth': 12.314174107142858, + 'top_anomalies': [ { 'importance': [ 0.06768, + 0.01667, + 0.00081, + 0.02437, + 0.04773, + 0.22197, + 0.18208, + 0.01868, + 0.11855, + 0.01983, + 0.01898, + 0.05306, + 0.20398, + 0.00562], + 'row': [ 183.0, + 8654.0, + '0', + 4.0, + 4.0, + 0.25, + 0.25, + 0.0, + 123.0, + 255.0, + 0.01, + 0.04, + 0.01, + 0.0], + 'score': 0.68782}, + { 'importance': [ 0.05645, + 0.02285, + 0.0015, + 0.05196, + 0.04435, + 0.0005, + 0.00056, + 0.18979, + 0.12402, + 0.23671, + 0.20723, + 0.05651, + 0.00144, + 0.00612], + 'row': [ 212.0, + 1940.0, + '0', + 1.0, + 2.0, + 0.0, + 0.0, + 1.0, + 1.0, + 69.0, + 1.0, + 0.04, + 0.0, + 0.0], + 'score': 0.6239}, + ...], + 'trees': [ { 'root': { 'children': [ { 'children': [ { 'children': [ { 'children': [ { 'children': + [ { 'population': 1, + 'predicates': [ { 'field': '00001f', + 'op': '>', + 'value': 35.54357}]}, + + ... 
+                                              { 'population': 1,
+                                                'predicates': [ { 'field': '00001f',
+                                                                  'op': '<=',
+                                                                  'value': 35.54357}]}],
+                                            'population': 2,
+                                            'predicates': [ { 'field': '000005',
+                                                              'op': '<=',
+                                                              'value': 1385.5166}]}],
+                                        'population': 3,
+                                        'predicates': [ { 'field': '000020',
+                                                          'op': '<=',
+                                                          'value': 65.14308},
+                                                        { 'field': '000019',
+                                                          'op': '=',
+                                                          'value': 0}]}],
+                                    'population': 105,
+                                    'predicates': [ { 'field': '000017',
+                                                      'op': '<=',
+                                                      'value': 13.21754},
+                                                    { 'field': '000009',
+                                                      'op': 'in',
+                                                      'value': [ '0']}]}],
+                                'population': 126,
+                                'predicates': [ True,
+                                                { 'field': '000018',
+                                                  'op': '=',
+                                                  'value': 0}]},
+                    'training_mean_depth': 11.071428571428571}]},
+      'name': "tiny_kdd's dataset anomaly detector",
+      'number_of_batchscores': 0,
+      'number_of_public_predictions': 0,
+      'number_of_scores': 0,
+      'out_of_bag': False,
+      'price': 0.0,
+      'private': True,
+      'project': None,
+      'range': [1, 200],
+      'replacement': False,
+      'resource': 'anomaly/540dfa9f9841fa5c8800076a',
+      'rows': 200,
+      'sample_rate': 1.0,
+      'sample_size': 126,
+      'seed': 'BigML',
+      'shared': False,
+      'size': 30549,
+      'source': 'source/540dfa979841fa5c7f000363',
+      'source_status': True,
+      'status': { 'code': 5,
+                  'elapsed': 32397,
+                  'message': 'The anomaly detector has been created',
+                  'progress': 1.0},
+      'subscription': False,
+      'tags': [],
+      'updated': '2014-09-08T23:54:28.647000',
+      'white_box': False}
+
+(Note that we have abbreviated the output in the snippet above for
+readability: the full anomaly detector you'll get is going to contain
+many more details.)
+
+The `trees` list contains the actual isolation forest, and it can be quite
+large. That's why this part of the resource should only be included
+in downloads when needed. If you are only interested in other properties, such
+as `top_anomalies`, you'll improve performance by excluding it, using the
+`excluded=trees` query string in the API call:
+
+.. code-block:: python
+
+    anomaly = api.get_anomaly('anomaly/540dfa9f9841fa5c8800076a', \
+                              query_string='excluded=trees')
+
+Each node in an isolation tree can have multiple predicates.
+For the node to be a valid branch when evaluated with a data point, all of its
+predicates must be true.
+
+You can check the anomaly detector properties at the `API documentation
+`_.
+
+Associations
+~~~~~~~~~~~~
+
+Association Discovery is a popular method to find relations among values
+in high-dimensional datasets.
+
+A common case where association discovery is often used is
+market basket analysis. This analysis seeks customer shopping
+patterns across large transactional
+datasets. For instance, do customers who buy hamburgers and ketchup also
+consume bread?
+
+Businesses use those insights to make decisions on promotions and product
+placements.
+Association Discovery can also be used for other purposes such as early
+incident detection, web usage analysis, or software intrusion detection.
+
+In BigML, the Association resource object can be built from any dataset, and
+its results are a list of association rules between the items in the dataset.
+In the example case, the corresponding
+association rule would have hamburgers and ketchup as the items at the
+left hand side of the association rule and bread would be the item at the
+right hand side. Both sides in this association rule are related,
+in the sense that observing
+the items in the left hand side implies observing the items in the right hand
+side.
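+
+For instance, here is a minimal sketch (not the only possible workflow; the
+dataset id and name below are just placeholders borrowed from the snippets in
+this section) of how an association resource could be created from an
+existing dataset with these bindings and its rules retrieved:
+
+.. code-block:: python
+
+    from bigml.api import BigML
+
+    api = BigML()  # credentials are read from BIGML_USERNAME / BIGML_API_KEY
+
+    # placeholder id: use the id of your own dataset
+    dataset = "dataset/562fae3f4e1727141d00004e"
+    association = api.create_association(dataset, {"name": "iris association"})
+    api.ok(association)  # waits until the association discovery task finishes
+
+    # every rule relates a left-hand side itemset to a right-hand side itemset
+    rules = association["object"]["associations"]["rules"]
+    print(len(rules), "rules found")
+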
+There are some metrics to assess the quality of these association rules:
+
+- Support: the proportion of instances which contain an itemset.
+
+For an association rule, it means the number of instances in the dataset which
+contain the rule's antecedent and rule's consequent together
+over the total number of instances (N) in the dataset.
+
+It gives a measure of the importance of the rule. Association rules have
+to satisfy a minimum support constraint (i.e., min_support).
+
+- Coverage: the support of the antecedent of an association rule.
+It measures how often a rule can be applied.
+
+- Confidence (or strength): the probability of seeing the rule's consequent
+under the condition that the instances also contain the rule's antecedent.
+Confidence is computed using the support of the association rule over the
+coverage. That is, the percentage of instances which contain the consequent
+and antecedent together over the number of instances which only contain
+the antecedent.
+
+Confidence is directed and gives different values for the association
+rules Antecedent → Consequent and Consequent → Antecedent. Association
+rules also need to satisfy a minimum confidence constraint
+(i.e., min_confidence).
+
+- Leverage: the difference between the support of the association
+rule (i.e., the antecedent and consequent appearing together) and what would
+be expected if antecedent and consequent were statistically independent.
+This is a value between -1 and 1. A positive value suggests a positive
+relationship and a negative value suggests a negative relationship.
+0 indicates independence.
+
+- Lift: how many times more often antecedent and consequent occur together
+than expected if they were statistically independent.
+A value of 1 suggests that there is no relationship between the antecedent
+and the consequent. Higher values suggest stronger positive relationships.
+Lower values suggest stronger negative relationships (the presence of the
+antecedent reduces the likelihood of the consequent).
+
+As for the items used in association rules, each type of field is parsed to
+extract items for the rules as follows:
+
+- Categorical: each different value (class) will be considered a separate item.
+- Text: each unique term will be considered a separate item.
+- Items: each different item in the items summary will be considered.
+- Numeric: values will be converted into categorical by making a
+segmentation of the values.
+For example, a numeric field with values ranging from 0 to 600 split
+into 3 segments:
+segment 1 → [0, 200), segment 2 → [200, 400), segment 3 → [400, 600].
+You can refine the behavior of the transformation using
+`discretization `_
+and `field_discretizations `_.
+
+The JSON structure for an association resource is:
+
+.. code-block:: python
+
+
+    >>> api.pprint(association['object'])
+    {
+        "associations":{
+            "complement":false,
+            "discretization":{
+                "pretty":true,
+                "size":5,
+                "trim":0,
+                "type":"width"
+            },
+            "items":[
+                {
+                    "complement":false,
+                    "count":32,
+                    "field_id":"000000",
+                    "name":"Segment 1",
+                    "bin_end":5,
+                    "bin_start":null
+                },
+                {
+                    "complement":false,
+                    "count":49,
+                    "field_id":"000000",
+                    "name":"Segment 3",
+                    "bin_end":7,
+                    "bin_start":6
+                },
+                {
+                    "complement":false,
+                    "count":12,
+                    "field_id":"000000",
+                    "name":"Segment 4",
+                    "bin_end":null,
+                    "bin_start":7
+                },
+                {
+                    "complement":false,
+                    "count":19,
+                    "field_id":"000001",
+                    "name":"Segment 1",
+                    "bin_end":2.5,
+                    "bin_start":null
+                },
+                ...
+ { + "complement":false, + "count":50, + "field_id":"000004", + "name":"Iris-versicolor" + }, + { + "complement":false, + "count":50, + "field_id":"000004", + "name":"Iris-virginica" + } + ], + "max_k": 100, + "min_confidence":0, + "min_leverage":0, + "min_lift":1, + "min_support":0, + "rules":[ + { + "confidence":1, + "id":"000000", + "leverage":0.22222, + "lhs":[ + 13 + ], + "lhs_cover":[ + 0.33333, + 50 + ], + "lift":3, + "p_value":0.000000000, + "rhs":[ + 6 + ], + "rhs_cover":[ + 0.33333, + 50 + ], + "support":[ + 0.33333, + 50 + ] + }, + { + "confidence":1, + "id":"000001", + "leverage":0.22222, + "lhs":[ + 6 + ], + "lhs_cover":[ + 0.33333, + 50 + ], + "lift":3, + "p_value":0.000000000, + "rhs":[ + 13 + ], + "rhs_cover":[ + 0.33333, + 50 + ], + "support":[ + 0.33333, + 50 + ] + }, + ... + { + "confidence":0.26, + "id":"000029", + "leverage":0.05111, + "lhs":[ + 13 + ], + "lhs_cover":[ + 0.33333, + 50 + ], + "lift":2.4375, + "p_value":0.0000454342, + "rhs":[ + 5 + ], + "rhs_cover":[ + 0.10667, + 16 + ], + "support":[ + 0.08667, + 13 + ] + }, + { + "confidence":0.18, + "id":"00002a", + "leverage":0.04, + "lhs":[ + 15 + ], + "lhs_cover":[ + 0.33333, + 50 + ], + "lift":3, + "p_value":0.0000302052, + "rhs":[ + 9 + ], + "rhs_cover":[ + 0.06, + 9 + ], + "support":[ + 0.06, + 9 + ] + }, + { + "confidence":1, + "id":"00002b", + "leverage":0.04, + "lhs":[ + 9 + ], + "lhs_cover":[ + 0.06, + 9 + ], + "lift":3, + "p_value":0.0000302052, + "rhs":[ + 15 + ], + "rhs_cover":[ + 0.33333, + 50 + ], + "support":[ + 0.06, + 9 + ] + } + ], + "rules_summary":{ + "confidence":{ + "counts":[ + [ + 0.18, + 1 + ], + [ + 0.24, + 1 + ], + [ + 0.26, + 2 + ], + ... + [ + 0.97959, + 1 + ], + [ + 1, + 9 + ] + ], + "maximum":1, + "mean":0.70986, + "median":0.72864, + "minimum":0.18, + "population":44, + "standard_deviation":0.24324, + "sum":31.23367, + "sum_squares":24.71548, + "variance":0.05916 + }, + "k":44, + "leverage":{ + "counts":[ + [ + 0.04, + 2 + ], + [ + 0.05111, + 4 + ], + [ + 0.05316, + 2 + ], + ... + [ + 0.22222, + 2 + ] + ], + "maximum":0.22222, + "mean":0.10603, + "median":0.10156, + "minimum":0.04, + "population":44, + "standard_deviation":0.0536, + "sum":4.6651, + "sum_squares":0.61815, + "variance":0.00287 + }, + "lhs_cover":{ + "counts":[ + [ + 0.06, + 2 + ], + [ + 0.08, + 2 + ], + [ + 0.10667, + 4 + ], + [ + 0.12667, + 1 + ], + ... + [ + 0.5, + 4 + ] + ], + "maximum":0.5, + "mean":0.29894, + "median":0.33213, + "minimum":0.06, + "population":44, + "standard_deviation":0.13386, + "sum":13.15331, + "sum_squares":4.70252, + "variance":0.01792 + }, + "lift":{ + "counts":[ + [ + 1.40625, + 2 + ], + [ + 1.5067, + 2 + ], + ... + [ + 2.63158, + 4 + ], + [ + 3, + 10 + ], + [ + 4.93421, + 2 + ], + [ + 12.5, + 2 + ] + ], + "maximum":12.5, + "mean":2.91963, + "median":2.58068, + "minimum":1.40625, + "population":44, + "standard_deviation":2.24641, + "sum":128.46352, + "sum_squares":592.05855, + "variance":5.04635 + }, + "p_value":{ + "counts":[ + [ + 0.000000000, + 2 + ], + [ + 0.000000000, + 4 + ], + [ + 0.000000000, + 2 + ], + ... + [ + 0.0000910873, + 2 + ] + ], + "maximum":0.0000910873, + "mean":0.0000106114, + "median":0.00000000, + "minimum":0.000000000, + "population":44, + "standard_deviation":0.0000227364, + "sum":0.000466903, + "sum_squares":0.0000000, + "variance":0.000000001 + }, + "rhs_cover":{ + "counts":[ + [ + 0.06, + 2 + ], + [ + 0.08, + 2 + ], + ... 
+ [ + 0.42667, + 2 + ], + [ + 0.46667, + 3 + ], + [ + 0.5, + 4 + ] + ], + "maximum":0.5, + "mean":0.29894, + "median":0.33213, + "minimum":0.06, + "population":44, + "standard_deviation":0.13386, + "sum":13.15331, + "sum_squares":4.70252, + "variance":0.01792 + }, + "support":{ + "counts":[ + [ + 0.06, + 4 + ], + [ + 0.06667, + 2 + ], + [ + 0.08, + 2 + ], + [ + 0.08667, + 4 + ], + [ + 0.10667, + 4 + ], + [ + 0.15333, + 2 + ], + [ + 0.18667, + 4 + ], + [ + 0.19333, + 2 + ], + [ + 0.20667, + 2 + ], + [ + 0.27333, + 2 + ], + [ + 0.28667, + 2 + ], + [ + 0.3, + 4 + ], + [ + 0.32, + 2 + ], + [ + 0.33333, + 6 + ], + [ + 0.37333, + 2 + ] + ], + "maximum":0.37333, + "mean":0.20152, + "median":0.19057, + "minimum":0.06, + "population":44, + "standard_deviation":0.10734, + "sum":8.86668, + "sum_squares":2.28221, + "variance":0.01152 + } + }, + "search_strategy":"leverage", + "significance_level":0.05 + }, + "category":0, + "clones":0, + "code":200, + "columns":5, + "created":"2015-11-05T08:06:08.184000", + "credits":0.017581939697265625, + "dataset":"dataset/562fae3f4e1727141d00004e", + "dataset_status":true, + "dataset_type":0, + "description":"", + "excluded_fields":[ ], + "fields_meta":{ + "count":5, + "limit":1000, + "offset":0, + "query_total":5, + "total":5 + }, + "input_fields":[ + "000000", + "000001", + "000002", + "000003", + "000004" + ], + "locale":"en_US", + "max_columns":5, + "max_rows":150, + "name":"iris' dataset's association", + "out_of_bag":false, + "price":0, + "private":true, + "project":null, + "range":[ + 1, + 150 + ], + "replacement":false, + "resource":"association/5621b70910cb86ae4c000000", + "rows":150, + "sample_rate":1, + "shared":false, + "size":4609, + "source":"source/562fae3a4e1727141d000048", + "source_status":true, + "status":{ + "code":5, + "elapsed":1072, + "message":"The association has been created", + "progress":1 + }, + "subscription":false, + "tags":[ ], + "updated":"2015-11-05T08:06:20.403000", + "white_box":false + } +Note that the output in the snippet above has been abbreviated. As you see, +the ``associations`` attribute stores items, rules and metrics extracted +from the datasets as well as the configuration parameters described in +the `developers section `_ . -You can check the optiml properties at the `API documentation -`_. +Topic Models +~~~~~~~~~~~~ -Fusions -------- +A topic model is an unsupervised machine learning method +for unveiling all the different topics +underlying a collection of documents. +BigML uses Latent Dirichlet Allocation (LDA), one of the most popular +probabilistic methods for topic modeling. +In BigML, each instance (i.e. each row in your dataset) will +be considered a document and the contents of all the text fields +given as inputs will be automatically concatenated and considered the +document bag of words. -A Fusion is a special type of composed resource for which all -submodels satisfy the following constraints: they're all either -classifications or regressions over the same kind of data or -compatible fields, with the same objective field. Given those -properties, a fusion can be considered a supervised model, -and therefore one can predict with fusions and evaluate them. -Ensembles can be viewed as a kind of fusion subject to the additional -constraints that all its submodels are tree models that, moreover, -have been built from the same base input data, but sampled in particular ways. +Topic model is based on the assumption that any document +exhibits a mixture of topics. 
Each topic is composed of a set of words +which are thematically related. The words from a given topic have different +probabilities for that topic. At the same time, each word can be attributable +to one or several topics. So for example the word "sea" may be found in +a topic related with sea transport but also in a topic related to holidays. +Topic model automatically discards stop words and high +frequency words. -The model types allowed to be a submodel of a fusion are: -deepnet, ensemble, fusion, model, logistic regression and linear regression. +Topic model's main applications include browsing, organizing and understanding +large archives of documents. It can been applied for information retrieval, +collaborative filtering, assessing document similarity among others. +The topics found in the dataset can also be very useful new features +before applying other models like classification, clustering, or +anomaly detection. -The JSON structure for an Fusion is: +The JSON structure for a topic model is: .. code-block:: python - >>> api.pprint(fusion["object"]) - { - "category": 0, - "code": 200, - "configuration": null, - "configuration_status": false, - "created": "2018-05-09T20:11:05.821000", - "credits_per_prediction": 0, - "description": "", - "fields_meta": { - "count": 5, - "limit": 1000, - "offset": 0, - "query_total": 5, - "total": 5 - }, - "fusion": { - "models": [ - { - "id": "ensemble/5af272eb4e1727d378000050", - "kind": "ensemble", - "name": "Iris ensemble", - "name_options": "boosted trees, 1999-node, 16-iteration, deterministic order, balanced" - }, - { - "id": "model/5af272fe4e1727d3780000d6", - "kind": "model", - "name": "Iris model", - "name_options": "1999-node, pruned, deterministic order, balanced" - }, - { - "id": "logisticregression/5af272ff4e1727d3780000d9", - "kind": "logisticregression", - "name": "Iris LR", - "name_options": "L2 regularized (c=1), bias, auto-scaled, missing values, eps=0.001" - } - ] - }, - "importance": { - "000000": 0.05847, - "000001": 0.03028, - "000002": 0.13582, - "000003": 0.4421 - }, - "model_count": { - "ensemble": 1, - "logisticregression": 1, - "model": 1, - "total": 3 - }, - "models": [ - "ensemble/5af272eb4e1727d378000050", - "model/5af272fe4e1727d3780000d6", - "logisticregression/5af272ff4e1727d3780000d9" - ], - "models_meta": { - "count": 3, - "limit": 1000, - "offset": 0, - "total": 3 - }, - "name": "iris", - "name_options": "3 total models (ensemble: 1, logisticregression: 1, model: 1)", - "number_of_batchpredictions": 0, - "number_of_evaluations": 0, - "number_of_predictions": 0, - "number_of_public_predictions": 0, - "objective_field": "000004", - "objective_field_details": { - "column_number": 4, - "datatype": "string", - "name": "species", - "optype": "categorical", - "order": 4 - }, - "objective_field_name": "species", - "objective_field_type": "categorical", - "objective_fields": [ - "000004" - ], - "private": true, - "project": null, - "resource":"fusion/59af8107b8aa0965d5b61138", - "shared": false, - "status": { - "code": 5, - "elapsed": 8420, - "message": "The fusion has been created", - "progress": 1 - }, - "subscription": false, - "tags": [], - "type": 0, - "updated": "2018-05-09T20:11:14.258000" - } + >>> api.pprint(topic['object']) + { 'category': 0, + 'code': 200, + 'columns': 1, + 'configuration': None, + 'configuration_status': False, + 'created': '2016-11-23T23:47:54.703000', + 'credits': 0.0, + 'credits_per_prediction': 0.0, + 'dataset': 'dataset/58362aa0983efc45a0000005', + 'dataset_field_types': { 
'categorical': 1, + 'datetime': 0, + 'effective_fields': 672, + 'items': 0, + 'numeric': 0, + 'preferred': 2, + 'text': 1, + 'total': 2}, + 'dataset_status': True, + 'dataset_type': 0, + 'description': '', + 'excluded_fields': [], + 'fields_meta': { 'count': 1, + 'limit': 1000, + 'offset': 0, + 'query_total': 1, + 'total': 1}, + 'input_fields': ['000001'], + 'locale': 'en_US', + 'max_columns': 2, + 'max_rows': 656, + 'name': u"spam dataset's Topic Model ", + 'number_of_batchtopicdistributions': 0, + 'number_of_public_topicdistributions': 0, + 'number_of_topicdistributions': 0, + 'ordering': 0, + 'out_of_bag': False, + 'price': 0.0, + 'private': True, + 'project': None, + 'range': [1, 656], + 'replacement': False, + 'resource': 'topicmodel/58362aaa983efc45a1000007', + 'rows': 656, + 'sample_rate': 1.0, + 'shared': False, + 'size': 54740, + 'source': 'source/58362a69983efc459f000001', + 'source_status': True, + 'status': { 'code': 5, + 'elapsed': 3222, + 'message': 'The topic model has been created', + 'progress': 1.0}, + 'subscription': True, + 'tags': [], + 'topic_model': { 'alpha': 4.166666666666667, + 'beta': 0.1, + 'bigrams': False, + 'case_sensitive': False, + 'fields': { '000001': { 'column_number': 1, + 'datatype': 'string', + 'name': 'Message', + 'optype': 'text', + 'order': 0, + 'preferred': True, + 'summary': { 'average_length': 78.14787, + 'missing_count': 0, + 'tag_cloud': [ [ 'call', + 72], + [ 'ok', + 36], + [ 'gt', + 34], + ... + [ 'worse', + 2], + [ 'worth', + 2], + [ 'write', + 2], + [ 'yest', + 2], + [ 'yijue', + 2]], + 'term_forms': { }}, + 'term_analysis': { 'case_sensitive': False, + 'enabled': True, + 'language': 'en', + 'stem_words': False, + 'token_mode': 'all', + 'use_stopwords': False}}}, + 'hashed_seed': 62146850, + 'language': 'en', + 'number_of_topics': 12, + 'term_limit': 4096, + 'term_topic_assignments': [ [ 0, + 5, + 0, + 1, + 0, + 19, + 0, + 0, + 19, + 0, + 1, + 0], + [ 0, + 0, + 0, + 13, + 0, + 0, + 0, + 0, + 5, + 0, + 0, + 0], + ... + [ 0, + 7, + 27, + 0, + 112, + 0, + 0, + 0, + 0, + 0, + 14, + 2]], + 'termset': [ '000', + '03', + '04', + '06', + '08000839402', + '08712460324', + ... -You can check the fusion properties at the `API documentation -`_. + 'yes', + 'yest', + 'yesterday', + 'yijue', + 'yo', + 'yr', + 'yup', + '\xfc'], + 'top_n_terms': 10, + 'topicmodel_seed': '26c386d781963ca1ea5c90dab8a6b023b5e1d180', + 'topics': [ { 'id': '000000', + 'name': 'Topic 00', + 'probability': 0.09375, + 'top_terms': [ [ 'im', + 0.04849], + [ 'hi', + 0.04717], + [ 'love', + 0.04585], + [ 'please', + 0.02867], + [ 'tomorrow', + 0.02867], + [ 'cos', + 0.02823], + [ 'sent', + 0.02647], + [ 'da', + 0.02383], + [ 'meet', + 0.02207], + [ 'dinner', + 0.01898]]}, + { 'id': '000001', + 'name': 'Topic 01', + 'probability': 0.08215, + 'top_terms': [ [ 'lt', + 0.1015], + [ 'gt', + 0.1007], + [ 'wish', + 0.03958], + [ 'feel', + 0.0272], + [ 'shit', + 0.02361], + [ 'waiting', + 0.02281], + [ 'stuff', + 0.02001], + [ 'name', + 0.01921], + [ 'comp', + 0.01522], + [ 'forgot', + 0.01482]]}, + ... + { 'id': '00000b', + 'name': 'Topic 11', + 'probability': 0.0826, + 'top_terms': [ [ 'call', + 0.15084], + [ 'min', + 0.05003], + [ 'msg', + 0.03185], + [ 'home', + 0.02648], + [ 'mind', + 0.02152], + [ 'lt', + 0.01987], + [ 'bring', + 0.01946], + [ 'camera', + 0.01905], + [ 'set', + 0.01905], + [ 'contact', + 0.01781]]}], + 'use_stopwords': False}, + 'updated': '2016-11-23T23:48:03.336000', + 'white_box': False} + +Note that the output in the snippet above has been abbreviated. 
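+
+As a quick illustration (a minimal sketch, not the only possible workflow),
+you could create a topic model from an existing dataset that has a text field
+and list each discovered topic together with a few of its top terms. The
+``dataset`` id below is just a placeholder taken from the snippet above:
+
+.. code-block:: python
+
+    from bigml.api import BigML
+
+    api = BigML()  # credentials are read from BIGML_USERNAME / BIGML_API_KEY
+
+    # placeholder id: use the id of your own dataset with a text field
+    dataset = "dataset/58362aa0983efc45a0000005"
+    topic_model = api.create_topic_model(dataset, {"name": "spam topic model"})
+    api.ok(topic_model)  # waits until the topic model is finished
+
+    # each topic has a name and a list of [term, probability] pairs
+    for topic in topic_model["object"]["topic_model"]["topics"]:
+        top_terms = ", ".join(term for term, _ in topic["top_terms"][:5])
+        print(topic["name"], "->", top_terms)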
+ + +The topic model returns a list of top terms for each topic found in the data. +Note that topics are not labeled, so you have to infer their meaning according +to the words they are composed of. + +Once you build the topic model you can calculate each topic probability +for a given document by using Topic Distribution. +This information can be useful to find documents similarities based +on their thematic. +As you see, +the ``topic_model`` attribute stores the topics and termset and term to +topic assignment, +as well as the configuration parameters described in +the `developers section `_ . PCAs ----- +~~~~ A PCA (Principal Component Analysis) resource fits a number of orthogonal projections (components) to maximally capture the variance in a dataset. This @@ -3026,43 +3415,43 @@ The JSON structure for an PCA is: {'code': 200, 'error': None, 'location': 'https://strato.dev.bigml.io/andromeda/pca/5c002572983efc0ac5000003', - 'object': {u'category': 0, - u'code': 200, - u'columns': 2, - u'configuration': None, - u'configuration_status': False, - u'created': u'2018-11-29T17:44:18.359000', - u'creator': u'merce', - u'credits': 0.0, - u'credits_per_prediction': 0.0, - u'dataset': u'dataset/5c00256a983efc0acf000000', - u'dataset_field_types': {u'categorical': 1, - u'datetime': 0, - u'items': 0, - u'numeric': 0, - u'preferred': 2, - u'text': 1, - u'total': 2}, - u'dataset_status': True, - u'description': u'', - u'excluded_fields': [], - u'fields_meta': {u'count': 2, - u'limit': 1000, - u'offset': 0, - u'query_total': 2, - u'total': 2}, - u'input_fields': [u'000000', u'000001'], - u'locale': u'en-us', - u'max_columns': 2, - u'max_rows': 7, - u'name': u'spam 4 words', - u'name_options': u'standardized', - u'number_of_batchprojections': 2, - u'number_of_projections': 0, - u'number_of_public_projections': 0, - u'ordering': 0, - u'out_of_bag': False, - u'pca': {u'components': [[-0.64757, + 'object': {'category': 0, + 'code': 200, + 'columns': 2, + 'configuration': None, + 'configuration_status': False, + 'created': '2018-11-29T17:44:18.359000', + 'creator': 'merce', + 'credits': 0.0, + 'credits_per_prediction': 0.0, + 'dataset': 'dataset/5c00256a983efc0acf000000', + 'dataset_field_types': {'categorical': 1, + 'datetime': 0, + 'items': 0, + 'numeric': 0, + 'preferred': 2, + 'text': 1, + 'total': 2}, + 'dataset_status': True, + 'description': '', + 'excluded_fields': [], + 'fields_meta': {'count': 2, + 'limit': 1000, + 'offset': 0, + 'query_total': 2, + 'total': 2}, + 'input_fields': ['000000', '000001'], + 'locale': 'en-us', + 'max_columns': 2, + 'max_rows': 7, + 'name': 'spam 4 words', + 'name_options': 'standardized', + 'number_of_batchprojections': 2, + 'number_of_projections': 0, + 'number_of_public_projections': 0, + 'ordering': 0, + 'out_of_bag': False, + 'pca': {'components': [[-0.64757, 0.83392, 0.1158, 0.83481, @@ -3070,63 +3459,422 @@ The JSON structure for an PCA is: -0.09426, -0.08544, -0.03457]], - u'cumulative_variance': [0.43667, + 'cumulative_variance': [0.43667, 0.74066, 0.87902, 0.98488, 0.99561, 1], - u'eigenvectors': [[-0.3894, + 'eigenvectors': [[-0.3894, 0.50146, 0.06963, ... -0.56542, -0.5125, -0.20734]], - u'fields': {u'000000': {u'column_number': 0, - u'datatype': u'string', - u'name': u'Type', + 'fields': {'000000': {'column_number': 0, + 'datatype': 'string', + 'name': 'Type', ... 
- u'token_mode': u'all', - u'use_stopwords': False}}}, - u'pca_seed': u'2c249dda00fbf54ab4cdd850532a584f286af5b6', - u'standardized': True, - u'text_stats': {u'000001': {u'means': [0.71429, + 'token_mode': 'all', + 'use_stopwords': False}}}, + 'pca_seed': '2c249dda00fbf54ab4cdd850532a584f286af5b6', + 'standardized': True, + 'text_stats': {'000001': {'means': [0.71429, 0.71429, 0.42857, 0.28571], - u'standard_deviations': [0.75593, + 'standard_deviations': [0.75593, 0.75593, 0.53452, 0.48795]}}, - u'variance': [0.43667, + 'variance': [0.43667, 0.30399, 0.13837, 0.10585, 0.01073, 0.00439]}, - u'price': 0.0, - u'private': True, - u'project': None, - u'range': None, - u'replacement': False, - u'resource': u'pca/5c002572983efc0ac5000003', - u'rows': 7, - u'sample_rate': 1.0, - u'shared': False, - u'size': 127, - u'source': u'source/5c00255e983efc0acd00001b', - u'source_status': True, - u'status': {u'code': 5, - u'elapsed': 1571, - u'message': u'The pca has been created', - u'progress': 1}, - u'subscription': True, - u'tags': [], - u'type': 0, - u'updated': u'2018-11-29T18:13:19.714000', - u'white_box': False}, - 'resource': u'pca/5c002572983efc0ac5000003'} + 'price': 0.0, + 'private': True, + 'project': None, + 'range': None, + 'replacement': False, + 'resource': 'pca/5c002572983efc0ac5000003', + 'rows': 7, + 'sample_rate': 1.0, + 'shared': False, + 'size': 127, + 'source': 'source/5c00255e983efc0acd00001b', + 'source_status': True, + 'status': {'code': 5, + 'elapsed': 1571, + 'message': 'The pca has been created', + 'progress': 1}, + 'subscription': True, + 'tags': [], + 'type': 0, + 'updated': '2018-11-29T18:13:19.714000', + 'white_box': False}, + 'resource': 'pca/5c002572983efc0ac5000003'} You can check the PCA properties at the `API documentation -`_. +`_. + +Predictions and Evaluations +--------------------------- + +Prediction +~~~~~~~~~~ + +The output of a supervised learning model for a particular input is its +prediction. In BigML, a model is ready to produce predictions immediately, so +there's no need of a special deployment in order to start using it. Here's how +you create a prediction for a model and its response: + +.. 
code-block:: python + + >>> input_data = {"petal length": 4} + >>> prediction = api.create_prediction(model_id, input_data) + >>> api.pprint(prediction["object"]) + { 'boosted_ensemble': False, + 'category': 12, + 'code': 201, + 'confidence': 0.40383, + 'confidence_bounds': {}, + 'confidences': [ ['Iris-setosa', 0], + ['Iris-versicolor', 0.40383], + ['Iris-virginica', 0.40383]], + 'configuration': None, + 'configuration_status': False, + 'created': '2024-09-09T15:48:58.918313', + 'creator': 'mmartin', + 'dataset': 'dataset/6668805ad7413f90007ab83e', + 'dataset_status': True, + 'description': 'Created using BigMLer', + 'expanded_input_data': {'000002': 4.0}, + 'explanation': None, + 'fields': { '000002': { 'column_number': 2, + 'datatype': 'double', + 'name': 'petal length', + 'optype': 'numeric', + 'order': 2, + 'preferred': True}, + '000003': { 'column_number': 3, + 'datatype': 'double', + 'name': 'petal width', + 'optype': 'numeric', + 'order': 3, + 'preferred': True}, + '000004': { 'column_number': 4, + 'datatype': 'string', + 'name': 'species', + 'optype': 'categorical', + 'order': 4, + 'preferred': True, + 'term_analysis': {'enabled': True}}}, + 'importance': {'000002': 1}, + 'input_data': {'petal length': 4}, + 'locale': 'en_US', + 'missing_strategy': 0, + 'model': 'model/6668805f002883f09483369d', + 'model_status': True, + 'model_type': 0, + 'name': 'iris.csv', + 'name_options': 'operating kind=probability, 1 inputs', + 'number_of_models': 1, + 'objective_field': '000004', + 'objective_field_name': 'species', + 'objective_field_type': 'categorical', + 'objective_fields': ['000004'], + 'operating_kind': 'probability', + 'output': 'Iris-versicolor', + 'prediction': {'000004': 'Iris-versicolor'}, + 'prediction_path': { 'confidence': 0.40383, + 'next_predicates': [ { 'count': 46, + 'field': '000003', + 'operator': '>', + 'value': 1.75}, + { 'count': 54, + 'field': '000003', + 'operator': '<=', + 'value': 1.75}], + 'node_id': 1, + 'objective_summary': { 'categories': [ [ 'Iris-versicolor', + 50], + [ 'Iris-virginica', + 50]]}, + 'path': [ { 'field': '000002', + 'operator': '>', + 'value': 2.45}]}, + 'private': True, + 'probabilities': [ ['Iris-setosa', 0.0033], + ['Iris-versicolor', 0.49835], + ['Iris-virginica', 0.49835]], + 'probability': 0.49835, + 'project': None, + 'query_string': '', + 'resource': 'prediction/66df18eac6f7849b7b3f10ec', + 'shared': False, + 'source': 'source/66688055450bc914a2c147e0', + 'source_status': True, + 'status': { 'code': 5, + 'elapsed': 227, + 'message': 'The prediction has been created', + 'progress': 1}, + 'subscription': True, + 'tags': ['BigMLer', 'BigMLer_TueJun1124_094957'], + 'task': 'classification', + 'type': 0, + 'updated': '2024-09-09T15:48:58.918335'} + +As you see, +the ``output`` attribute stores the prediction value and the ``confidence`` +and ``probability`` attributes show the respective values. The rest of the +dictionary contains the configuration parameters described in +the `developers section `_. + +Evaluation +~~~~~~~~~~ + +The predictive performance of a model can be measured using many different +measures. In BigML these measures can be obtained by creating evaluations. To +create an evaluation you need the id of the model you are evaluating and the id +of the dataset that contains the data to be tested with. The result is shown +as: + +.. 
code-block:: python + + >>> evaluation = api.get_evaluation(evaluation) + >>> api.pprint(evaluation['object']['result']) + { 'class_names': ['0', '1'], + 'mode': { 'accuracy': 0.9802, + 'average_f_measure': 0.495, + 'average_phi': 0, + 'average_precision': 0.5, + 'average_recall': 0.4901, + 'confusion_matrix': [[99, 0], [2, 0]], + 'per_class_statistics': [ { 'accuracy': 0.9801980198019802, + 'class_name': '0', + 'f_measure': 0.99, + 'phi_coefficient': 0, + 'precision': 1.0, + 'present_in_test_data': True, + 'recall': 0.9801980198019802}, + { 'accuracy': 0.9801980198019802, + 'class_name': '1', + 'f_measure': 0, + 'phi_coefficient': 0, + 'precision': 0.0, + 'present_in_test_data': True, + 'recall': 0}]}, + 'model': { 'accuracy': 0.9901, + 'average_f_measure': 0.89746, + 'average_phi': 0.81236, + 'average_precision': 0.99495, + 'average_recall': 0.83333, + 'confusion_matrix': [[98, 1], [0, 2]], + 'per_class_statistics': [ { 'accuracy': 0.9900990099009901, + 'class_name': '0', + 'f_measure': 0.9949238578680203, + 'phi_coefficient': 0.8123623944599232, + 'precision': 0.98989898989899, + 'present_in_test_data': True, + 'recall': 1.0}, + { 'accuracy': 0.9900990099009901, + 'class_name': '1', + 'f_measure': 0.8, + 'phi_coefficient': 0.8123623944599232, + 'precision': 1.0, + 'present_in_test_data': True, + 'recall': 0.6666666666666666}]}, + 'random': { 'accuracy': 0.50495, + 'average_f_measure': 0.36812, + 'average_phi': 0.13797, + 'average_precision': 0.74747, + 'average_recall': 0.51923, + 'confusion_matrix': [[49, 50], [0, 2]], + 'per_class_statistics': [ { 'accuracy': 0.504950495049505, + 'class_name': '0', + 'f_measure': 0.6621621621621622, + 'phi_coefficient': 0.1379728923974526, + 'precision': 0.494949494949495, + 'present_in_test_data': True, + 'recall': 1.0}, + { 'accuracy': 0.504950495049505, + 'class_name': '1', + 'f_measure': 0.07407407407407407, + 'phi_coefficient': 0.1379728923974526, + 'precision': 1.0, + 'present_in_test_data': True, + 'recall': 0.038461538461538464}]}} + +where two levels of detail are easily identified. For classifications, +the first level shows these keys: + +- **class_names**: A list with the names of all the categories for the objective field (i.e., all the classes). +- **mode**: A detailed result object. Measures the performance of the classifier that predicts the mode class for all the instances in the dataset. +- **model**: A detailed result object. Measures the performance of the model being evaluated. +- **random**: A detailed result object. Measures the performance of the classifier that predicts a random class for all the instances in the dataset. + +and the detailed result objects include ``accuracy``, ``average_f_measure``, ``average_phi``, +``average_precision``, ``average_recall``, ``confusion_matrix`` +and ``per_class_statistics``. + +For regressions, the first level will contain these keys: + +- **mean**: A detailed result object. Measures the performance of the model that predicts the mean for all the instances in the dataset. +- **model**: A detailed result object. Measures the performance of the model being evaluated. +- **random**: A detailed result object. Measures the performance of the model that predicts a random value for all the instances in the dataset. + +where the detailed result objects include ``mean_absolute_error``, +``mean_squared_error`` and ``r_squared`` (refer to the +`developers documentation `_ for +more info on the meaning of these measures). + +You can check the evaluation properties at the `API documentation +`_. + +Centroid +~~~~~~~~ + +A ``centroid`` is the value predicted by a cluster model. Here's how to create +a centroid: + + +.. 
code-block:: python + + >>> input_data = {"petal length": 4} + >>> centroid = api.create_centroid(cluster_id, input_data) + +Mind that you will need to provide values for all the input fields in order to +create a centroid. To know more details about the centroid properties and +parameters you can check the corresponding +`API documentation `_. + +Anomaly Score +~~~~~~~~~~~~~ + +An ``anomaly score`` is the value predicted by an anomaly detector. +Here's how to create an anomaly score: + + +.. code-block:: python + + >>> input_data = {"petal length": 4} + >>> anomaly_score = api.create_anomaly_score(anomaly_id, input_data) + +To know more details about the anomaly score properties and +parameters you can check the corresponding +`API documentation `_. + +Association Set +~~~~~~~~~~~~~~~ + +An ``association set`` is the value predicted by an association discovery model. +Here's how to create an association set: + + +.. code-block:: python + + >>> input_data = {"petal length": 4} + >>> association_set = api.create_association_set(association_id, input_data) + +To know more details about the association set properties and +parameters you can check the corresponding +`API documentation `_. + +Topic Distribution +~~~~~~~~~~~~~~~~~~ + +A ``topic distribution`` is the value predicted by a topic model. +Here's how to create a topic distribution: + + +.. code-block:: python + + >>> input_data = {"text": "Now is the winter of our discontent"} + >>> topic_distribution = api.create_topic_distribution(topic_model_id, input_data) + +To know more details about the topic distribution properties and +parameters you can check the corresponding +`API documentation `_. + +Batch Prediction +~~~~~~~~~~~~~~~~ + +In BigML, you can create predictions for all the inputs provided as rows of a +dataset, i.e., a batch prediction. +The result of a batch prediction can either be downloaded as a CSV or +become a new dataset. As with predictions, a model is ready to produce batch +predictions immediately, so there's no need of a special deployment in order +to start using it. Here's how you create a batch prediction for a model +and its response: + +.. code-block:: python + + >>> batch_prediction = api.create_batch_prediction(model_id, test_dataset) + +To know more details about the batch prediction properties and +parameters you can check the corresponding +`API documentation `_. + +Batch Centroid +~~~~~~~~~~~~~~ + +In BigML, you can create centroids for all the inputs provided as rows of a +dataset, i.e., a batch centroid. +The result of a batch centroid can either be downloaded as a CSV or +become a new dataset. As with predictions, a cluster is ready to produce batch +centroids immediately, so there's no need of a special deployment in order +to start using it. Here's how you create a batch centroid for a cluster +and its response: + +.. code-block:: python + + >>> batch_centroid = api.create_batch_centroid(cluster_id, test_dataset) + +To know more details about the batch centroid properties and +parameters you can check the corresponding +`API documentation `_. + +Batch Anomaly Score +~~~~~~~~~~~~~~~~~~~ + +In BigML, you can create anomaly scores for all the inputs provided as rows of a +dataset, i.e., a batch anomaly score. +The result of a batch anomaly score can either be downloaded as a CSV or +become a new dataset. As with predictions, an anomaly detector +is ready to produce batch anomaly scores immediately, +so there's no need of a special deployment in order +to start using it. 
Here's how you create a batch anomaly score for an anomaly +detector and its response: + +.. code-block:: python + + >>> batch_anomaly_score = api.create_batch_anomaly_score( + anomaly_id, test_dataset) + +To know more details about the batch anomaly score properties and +parameters you can check the corresponding +`API documentation `_. + +Batch Topic Distribution +~~~~~~~~~~~~~~~~~~~~~~~~ + +In BigML, you can create topic distributions for all the inputs +provided as rows of a dataset, i.e. a batch topic distribution. +The result of a batch topic distribution can either be downloaded as a CSV or +become a new dataset. As with predictions, a topic model is ready to produce +batch topic distributions immediately, so there's no need of a +special deployment in order to start using it. +Here's how you create a batch topic distribution for a topic model +and its response: + +.. code-block:: python + + >>> batch_topic_distribution = api.create_batch_topic_distribution( + topic_id, test_dataset) + +To know more details about the batch topic distribution properties and +parameters you can check the corresponding +`API documentation `_. diff --git a/docs/quick_start.rst b/docs/quick_start.rst new file mode 100644 index 00000000..2ff7b0ac --- /dev/null +++ b/docs/quick_start.rst @@ -0,0 +1,284 @@ +Quick Start +=========== + +Imagine that you want to use `this csv +file `_ containing the `Iris +flower dataset `_ to +predict the species of a flower whose ``petal length`` is ``2.45`` and +whose ``petal width`` is ``1.75``. A preview of the dataset is shown +below. It has 4 numeric fields: ``sepal length``, ``sepal width``, +``petal length``, ``petal width`` and a categorical field: ``species``. +By default, BigML considers the last field in the dataset as the +objective field (i.e., the field that you want to generate predictions +for). + +:: + + sepal length,sepal width,petal length,petal width,species + 5.1,3.5,1.4,0.2,Iris-setosa + 4.9,3.0,1.4,0.2,Iris-setosa + 4.7,3.2,1.3,0.2,Iris-setosa + ... + 5.8,2.7,3.9,1.2,Iris-versicolor + 6.0,2.7,5.1,1.6,Iris-versicolor + 5.4,3.0,4.5,1.5,Iris-versicolor + ... + 6.8,3.0,5.5,2.1,Iris-virginica + 5.7,2.5,5.0,2.0,Iris-virginica + 5.8,2.8,5.1,2.4,Iris-virginica + +You can easily generate a prediction following these steps: + +.. code-block:: python + + from bigml.api import BigML + + api = BigML() + + source = api.create_source('./data/iris.csv') + dataset = api.create_dataset(source) + model = api.create_model(dataset) + prediction = api.create_prediction(model, \ + {"petal width": 1.75, "petal length": 2.45}) + +You can then print the prediction using the ``pprint`` method: + +.. code-block:: python + + >>> api.pprint(prediction) + species for {"petal width": 1.75, "petal length": 2.45} is Iris-setosa + +Certainly, any of the resources created in BigML can be configured using +several arguments described in the `API documentation `_. +Any of these configuration arguments can be added to the ``create`` method +as a dictionary in the last optional argument of the calls: + +.. 
code-block:: python + + from bigml.api import BigML + + api = BigML() + + source_args = {"name": "my source", + "source_parser": {"missing_tokens": ["NULL"]}} + source = api.create_source('./data/iris.csv', source_args) + dataset_args = {"name": "my dataset"} + dataset = api.create_dataset(source, dataset_args) + model_args = {"objective_field": "species"} + model = api.create_model(dataset, model_args) + prediction_args = {"name": "my prediction"} + prediction = api.create_prediction(model, \ + {"petal width": 1.75, "petal length": 2.45}, + prediction_args) + +The ``iris`` dataset has a small number of instances and will usually be +created almost instantly, so the ``api.create_`` calls will probably return the +finished resources outright. As BigML's API is asynchronous, +in general you will need to ensure +that objects are finished before using them by calling ``api.ok``. + +.. code-block:: python + + from bigml.api import BigML + + api = BigML() + + source = api.create_source('./data/iris.csv') + api.ok(source) + dataset = api.create_dataset(source) + api.ok(dataset) + model = api.create_model(dataset) + api.ok(model) + prediction = api.create_prediction(model, \ + {"petal width": 1.75, "petal length": 2.45}) + +Note that the prediction +call is not followed by the ``api.ok`` method. Predictions are generated so +quickly that, unlike the +rest of the resources, they are returned synchronously as finished objects. + +As an alternative to the ``api.ok`` method, BigML offers +`webhooks `_ that can be set +when creating a resource and will call the URL of your choice when the +finished or failed event is reached. A secret can be included in the call to +verify the webhook call authenticity, and a + +.. code-block:: python + + bigml.webhooks.check_signature(request, signature) + +function is offered to that end. As an example, this snippet creates a source +and sets a webhook to call ``https://my_webhook.com/endpoint`` when finished: + +.. code-block:: python + + from bigml.api import BigML + api = BigML() + # using a webhook with a secret + api.create_source("https://static.bigml.com/csv/iris.csv", + {"webhook": {"url": "https://my_webhook.com/endpoint", + "secret": "mysecret"}}) + + +The ``iris`` prediction example assumed that your objective +field (the one you want to predict) is the last field in the dataset. +If that's not the case, you can explicitly +set the name of this field in the creation call using the ``objective_field`` +argument: + + +.. code-block:: python + + from bigml.api import BigML + + api = BigML() + + source = api.create_source('./data/iris.csv') + api.ok(source) + dataset = api.create_dataset(source) + api.ok(dataset) + model = api.create_model(dataset, {"objective_field": "species"}) + api.ok(model) + prediction = api.create_prediction(model, \ + {'sepal length': 5, 'sepal width': 2.5}) + + +You can also generate an evaluation for the model by using: + +.. code-block:: python + + test_source = api.create_source('./data/test_iris.csv') + api.ok(test_source) + test_dataset = api.create_dataset(test_source) + api.ok(test_dataset) + evaluation = api.create_evaluation(model, test_dataset) + api.ok(evaluation) + + +The API object also offers the ``create``, ``get``, ``update`` and ``delete`` +generic methods to manage all types of resources. The type of resource to be +created is passed as the first argument to the ``create`` method: + +.. 
code-block:: python + + from bigml.api import BigML + + api = BigML() + + source = api.create('source', './data/iris.csv') + source = api.update(source, {"name": "my new source name"}) + +Note that these methods don't need ``api.ok`` to be called +to wait for the resource to be finished: +they wait for it internally by default. +This can be avoided by passing ``finished=False`` as one of the arguments. + + +.. code-block:: python + + from bigml.api import BigML + + api = BigML() + + source = api.create('source', './data/iris.csv') + dataset = api.create('dataset', source, finished=False) # unfinished + api.ok(dataset) # waiting explicitly for the dataset to finish + dataset = api.update(dataset, {"name": "my_new_dataset_name"}, + finished=False) + api.ok(dataset) + +As an example of the ``delete`` and ``get`` methods, we could +create a batch prediction, put the predictions in a +dataset object and delete the ``batch_prediction``. + +.. code-block:: python + + from bigml.api import BigML + + api = BigML() + + batch_prediction = api.create('batchprediction', + 'model/5f3c3d2b5299637102000882', + 'dataset/5f29a563529963736c0116e9', + args={"output_dataset": True}) + batch_prediction_dataset = api.get(batch_prediction["object"][ \ + "output_dataset_resource"]) + api.delete(batch_prediction) + +If you set the ``storage`` argument in the ``api`` instantiation: + +.. code-block:: python + + api = BigML(storage='./storage') + +all the generated, updated or retrieved resources will be automatically +saved to the chosen directory. Once they are stored locally, the +``retrieve_resource`` method will look for the resource information +first in the local storage before trying to download the information from +the API. + +.. code-block:: python + + dataset = api.retrieve_resource("dataset/5e8e5672c7736e3d830037b5", + query_string="limit=-1") + + +Alternatively, you can use the ``export`` method to explicitly +download the JSON information +that describes any of your resources in BigML to a particular file: + +.. code-block:: python + + api.export('model/5acea49a08b07e14b9001068', + filename="my_dir/my_model.json") + +This example downloads the JSON for the model and stores it in +the ``my_dir/my_model.json`` file. + +In the case of models that can be represented in ``PMML`` syntax, the +``export`` method can be used to produce the corresponding ``PMML`` file. + +.. code-block:: python + + api.export('model/5acea49a08b07e14b9001068', + filename="my_dir/my_model.pmml", + pmml=True) + +You can also retrieve the last resource created with some previously given tag: + +.. code-block:: python + + api.export_last("foo", + resource_type="ensemble", + filename="my_dir/my_ensemble.json") + +which selects the last ensemble that has a ``foo`` tag. This mechanism can +be especially useful when retrieving retrained models that have been created +with a shared unique keyword as tag. + +For a descriptive overview of the steps that you will usually need to +follow to model +your data and obtain predictions, please see the `basic Workflow sketch +`_ +document. 
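+
+One common use of those exported JSON files is building local versions of your
+models to make predictions offline. As a minimal sketch of that workflow,
+assuming the model JSON has already been exported to ``my_dir/my_model.json``
+as in the example above and that the local ``Model`` class can be instantiated
+directly from that file, you could write:
+
+.. code-block:: python
+
+    from bigml.model import Model
+
+    # build a local model from the previously exported JSON file
+    local_model = Model('my_dir/my_model.json')
+    # the prediction is computed in memory, with no API request involved
+    prediction = local_model.predict(
+        {"petal width": 1.75, "petal length": 2.45})
+
+Once the JSON file is available locally, predictions like this one do not
+need any further calls to the API.
+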
You can also check other simple examples in the following documents: + +- `model 101 <101_model.html>`_ +- `logistic regression 101 <101_logistic_regression.html>`_ +- `linear regression 101 <101_linear_regression.html>`_ +- `ensemble 101 <101_ensemble.html>`_ +- `cluster 101 <101_cluster.html>`_ +- `anomaly detector 101 <101_anomaly.html>`_ +- `association 101 <101_association.html>`_ +- `topic model 101 <101_topic_model.html>`_ +- `deepnet 101 <101_deepnet.html>`_ +- `time series 101 <101_ts.html>`_ +- `fusion 101 <101_fusion.html>`_ +- `optiml 101 <101_optiml.html>`_ +- `PCA 101 <101_pca.html>`_ +- `scripting 101 <101_scripting.html>`_ + +And for examples on Image Processing: + +- `Images Classification 101 <101_images_classification.html>`_ +- `Object Detection 101 <101_object_detection.html>`_ +- `Images Feature Extraction 101 <101_images_feature_extraction.html>`_ diff --git a/docs/reading_resources.rst b/docs/reading_resources.rst index e182915f..541125e4 100644 --- a/docs/reading_resources.rst +++ b/docs/reading_resources.rst @@ -30,6 +30,66 @@ that can be used to filter out or limit the attributes obtained: query_string="exclude=root") + +Public and shared resources +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The previous examples use resources that were created by the same user +that asks for their retrieval or modification. If a user wants to share one +of her resources, she can make it public or share it. Declaring a resource +public means that anyone can see the resource. This can be applied to datasets +and models. To make a dataset public, just update its ``private`` property: + +.. code-block:: python + + api.update_dataset('dataset/5143a51a37203f2cf7000972', {'private': False}) + +and any user will be able to download it using its id prepended by ``public``: + +.. code-block:: python + + api.get_dataset('public/dataset/5143a51a37203f2cf7000972') + +In the models' case, you can also choose whether you want the model to be fully +downloadable or just accessible to make predictions. This is controlled with the +``white_box`` property. If you want to publish your model completely, just +use: + +.. code-block:: python + + api.update_model('model/5143a51a37203f2cf7000956', {'private': False, + 'white_box': True}) + +Both public models and datasets will be openly accessible to anyone, +registered or not, from the web +gallery. + +Still, you may want to share your models with other users, but without making +them public for everyone. This can be achieved by setting the ``shared`` +property: + +.. code-block:: python + + api.update_model('model/5143a51a37203f2cf7000956', {'shared': True}) + +Shared models can be accessed using their share hash (property ``shared_hash`` +in the original model): + +.. code-block:: python + + api.get_model('shared/model/d53iw39euTdjsgesj7382ufhwnD') + +or by using their original id with the creator user as username and a specific +sharing api_key that you will find as the ``sharing_api_key`` property in the updated +model: + +.. code-block:: python + + api.get_model('model/5143a51a37203f2cf7000956', shared_username='creator', + shared_api_key='c972018dc5f2789e65c74ba3170fda31d02e00c3') + +Only users with the share link or credentials information will be able to +access your shared models. + Listing Resources ----------------- @@ -178,63 +238,3 @@ Name of predictions ordered by name. 
[prediction['name'] for prediction in api.list_predictions("order_by=name")['objects']] - -Public and shared resources ---------------------------- - -The previous examples use resources that were created by the same user -that asks for their retrieval or modification. If a user wants to share one -of her resources, she can make them public or share them. Declaring a resource -public means that anyone can see the resource. This can be applied to datasets -and models. To turn a dataset public, just update its ``private`` property: - -.. code-block:: python - - api.update_dataset('dataset/5143a51a37203f2cf7000972', {'private': false}) - -and any user will be able to download it using its id prepended by ``public``: - -.. code-block:: python - - api.get_dataset('public/dataset/5143a51a37203f2cf7000972') - -In the models' case, you can also choose if you want the model to be fully -downloadable or just accesible to make predictions. This is controlled with the -``white_box`` property. If you want to publish your model completely, just -use: - -.. code-block:: python - - api.update_model('model/5143a51a37203f2cf7000956', {'private': false, - 'white_box': true}) - -Both public models and datasets, will be openly accessible for anyone, -registered or not, from the web -gallery. - -Still, you may want to share your models with other users, but without making -them public for everyone. This can be achieved by setting the ``shared`` -property: - -.. code-block:: python - - api.update_model('model/5143a51a37203f2cf7000956', {'shared': true}) - -Shared models can be accessed using their share hash (propery ``shared_hash`` -in the original model): - -.. code-block:: python - - api.get_model('shared/model/d53iw39euTdjsgesj7382ufhwnD') - -or by using their original id with the creator user as username and a specific -sharing api_key you will find as property ``sharing_api_key`` in the updated -model: - -.. code-block:: python - - api.get_model('model/5143a51a37203f2cf7000956', shared_username='creator', - shared_api_key='c972018dc5f2789e65c74ba3170fda31d02e00c3') - -Only users with the share link or credentials information will be able to -access your shared models. diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 00000000..6daf89af --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,2 @@ +sphinx +sphinx_rtd_theme==2.0.0 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..1de495d4 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,8 @@ +[build-system] +requires=[ + "setuptools==69.0.0" +] + +[tool.black] +line-length = 80 +target-version = ['py312'] diff --git a/setup.py b/setup.py index 824fdea2..c7858b6c 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# Copyright 2012-2023 BigML, Inc +# Copyright 2012-2025 BigML, Inc # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. 
You may obtain @@ -30,7 +30,7 @@ open(version_py_path).read()).group(1) TOPIC_MODELING_DEPENDENCIES = ["cython", "pystemmer==2.2.0.1"] -IMAGES_DEPENDENCIES = ["bigml-sensenet==0.7.2"] +IMAGES_DEPENDENCIES = ["bigml-sensenet==0.7.5"] # Concatenate files into the long description file_contents = [] @@ -50,7 +50,8 @@ download_url="https://github.com/bigmlcom/python", license="http://www.apache.org/licenses/LICENSE-2.0", setup_requires = ['pytest'], - install_requires = ["unidecode", "bigml-chronos>=0.4.3", "requests", + install_requires = ["setuptools==70.0.0", "unidecode", + "bigml-chronos>=0.4.3", "requests", "requests-toolbelt", "msgpack", "numpy>=1.22", "scipy", "javascript"], extras_require={"images": IMAGES_DEPENDENCIES,