From 6024fc8a8f2212f618ef16bc43dd5aeab79b1dc0 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Nov 2023 10:19:06 +0100 Subject: [PATCH 1/5] Add datasets/qualities/list endpoint --- src/main.py | 6 ++++-- src/routers/v1/qualities.py | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) create mode 100644 src/routers/v1/qualities.py diff --git a/src/main.py b/src/main.py index 3c5d595c..44f7cefb 100644 --- a/src/main.py +++ b/src/main.py @@ -3,7 +3,8 @@ import uvicorn from fastapi import FastAPI from routers.mldcat_ap.dataset import router as mldcat_ap_router -from routers.v1.datasets import router as datasets_router_old_format +from routers.v1.datasets import router as datasets_router_v1_format +from routers.v1.qualities import router as qualities_router from routers.v2.datasets import router as datasets_router @@ -37,7 +38,8 @@ def create_api() -> FastAPI: app = FastAPI() app.include_router(datasets_router) - app.include_router(datasets_router_old_format) + app.include_router(datasets_router_v1_format) + app.include_router(qualities_router) app.include_router(mldcat_ap_router) return app diff --git a/src/routers/v1/qualities.py b/src/routers/v1/qualities.py new file mode 100644 index 00000000..fc568b96 --- /dev/null +++ b/src/routers/v1/qualities.py @@ -0,0 +1,18 @@ +from typing import Literal + +from fastapi import APIRouter + +router = APIRouter(prefix="/v1/datasets", tags=["datasets"]) + + +@router.get("/qualities/list") +def list_qualities() -> ( + dict[ + Literal["data_qualities_list"], + dict[ + Literal["quality"], + list[str], + ], + ] +): + return {"data_qualities_list": {"quality": []}} From caf60f22c703f9c1b5bee1f144b50102dad9ffe5 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Nov 2023 10:32:57 +0100 Subject: [PATCH 2/5] Add static test for qualities/list --- tests/routers/v1/qualities_test.py | 122 +++++++++++++++++++++++++++++ 1 file changed, 122 insertions(+) create mode 100644 tests/routers/v1/qualities_test.py diff --git a/tests/routers/v1/qualities_test.py b/tests/routers/v1/qualities_test.py new file mode 100644 index 00000000..e85f56a1 --- /dev/null +++ b/tests/routers/v1/qualities_test.py @@ -0,0 +1,122 @@ +import http.client + +from starlette.testclient import TestClient + + +def test_list_qualities_identical(api_client: TestClient) -> None: + response = api_client.get("/v1/datasets/qualities/list") + assert response.status_code == http.client.OK + expected = { + "data_qualities_list": { + "quality": [ + "AutoCorrelation", + "CfsSubsetEval_DecisionStumpAUC", + "CfsSubsetEval_DecisionStumpErrRate", + "CfsSubsetEval_DecisionStumpKappa", + "CfsSubsetEval_NaiveBayesAUC", + "CfsSubsetEval_NaiveBayesErrRate", + "CfsSubsetEval_NaiveBayesKappa", + "CfsSubsetEval_kNN1NAUC", + "CfsSubsetEval_kNN1NErrRate", + "CfsSubsetEval_kNN1NKappa", + "ClassEntropy", + "DecisionStumpAUC", + "DecisionStumpErrRate", + "DecisionStumpKappa", + "Dimensionality", + "EquivalentNumberOfAtts", + "J48.00001.AUC", + "J48.00001.ErrRate", + "J48.00001.Kappa", + "J48.0001.AUC", + "J48.0001.ErrRate", + "J48.0001.Kappa", + "J48.001.AUC", + "J48.001.ErrRate", + "J48.001.Kappa", + "MajorityClassPercentage", + "MajorityClassSize", + "MaxAttributeEntropy", + "MaxKurtosisOfNumericAtts", + "MaxMeansOfNumericAtts", + "MaxMutualInformation", + "MaxNominalAttDistinctValues", + "MaxSkewnessOfNumericAtts", + "MaxStdDevOfNumericAtts", + "MeanAttributeEntropy", + "MeanKurtosisOfNumericAtts", + "MeanMeansOfNumericAtts", + "MeanMutualInformation", + "MeanNoiseToSignalRatio", + "MeanNominalAttDistinctValues", + "MeanSkewnessOfNumericAtts", + "MeanStdDevOfNumericAtts", + "MinAttributeEntropy", + "MinKurtosisOfNumericAtts", + "MinMeansOfNumericAtts", + "MinMutualInformation", + "MinNominalAttDistinctValues", + "MinSkewnessOfNumericAtts", + "MinStdDevOfNumericAtts", + "MinorityClassPercentage", + "MinorityClassSize", + "NaiveBayesAUC", + "NaiveBayesErrRate", + "NaiveBayesKappa", + "NumberOfBinaryFeatures", + "NumberOfClasses", + "NumberOfFeatures", + "NumberOfInstances", + "NumberOfInstancesWithMissingValues", + "NumberOfMissingValues", + "NumberOfNumericFeatures", + "NumberOfSymbolicFeatures", + "PercentageOfBinaryFeatures", + "PercentageOfInstancesWithMissingValues", + "PercentageOfMissingValues", + "PercentageOfNumericFeatures", + "PercentageOfSymbolicFeatures", + "Quartile1AttributeEntropy", + "Quartile1KurtosisOfNumericAtts", + "Quartile1MeansOfNumericAtts", + "Quartile1MutualInformation", + "Quartile1SkewnessOfNumericAtts", + "Quartile1StdDevOfNumericAtts", + "Quartile2AttributeEntropy", + "Quartile2KurtosisOfNumericAtts", + "Quartile2MeansOfNumericAtts", + "Quartile2MutualInformation", + "Quartile2SkewnessOfNumericAtts", + "Quartile2StdDevOfNumericAtts", + "Quartile3AttributeEntropy", + "Quartile3KurtosisOfNumericAtts", + "Quartile3MeansOfNumericAtts", + "Quartile3MutualInformation", + "Quartile3SkewnessOfNumericAtts", + "Quartile3StdDevOfNumericAtts", + "REPTreeDepth1AUC", + "REPTreeDepth1ErrRate", + "REPTreeDepth1Kappa", + "REPTreeDepth2AUC", + "REPTreeDepth2ErrRate", + "REPTreeDepth2Kappa", + "REPTreeDepth3AUC", + "REPTreeDepth3ErrRate", + "REPTreeDepth3Kappa", + "RandomTreeDepth1AUC", + "RandomTreeDepth1ErrRate", + "RandomTreeDepth1Kappa", + "RandomTreeDepth2AUC", + "RandomTreeDepth2ErrRate", + "RandomTreeDepth2Kappa", + "RandomTreeDepth3AUC", + "RandomTreeDepth3ErrRate", + "RandomTreeDepth3Kappa", + "StdvNominalAttDistinctValues", + "kNN1NAUC", + "kNN1NErrRate", + "kNN1NKappa", + ], + }, + } + assert expected == response.json() From 9b00308942bba41305168562772106b17c5a22d1 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Nov 2023 11:15:28 +0100 Subject: [PATCH 3/5] Fetch dataset qualities from the database --- src/database/datasets.py | 14 ++++++++++++++ src/routers/v1/qualities.py | 27 +++++++++++++++------------ tests/routers/v1/qualities_test.py | 26 +++++++++++++++++++++++++- 3 files changed, 54 insertions(+), 13 deletions(-) diff --git a/src/database/datasets.py b/src/database/datasets.py index b3f453ff..62e7df97 100644 --- a/src/database/datasets.py +++ b/src/database/datasets.py @@ -6,6 +6,20 @@ from database.meta import get_column_names +def list_all_qualities(connection: Connection) -> list[str]: + # The current implementation only fetches *used* qualities, otherwise you should + # query: SELECT `name` FROM `quality` WHERE `type`='DataQuality' + qualities = connection.execute( + text( + """ + SELECT DISTINCT(`quality`) + FROM data_quality + """, + ), + ) + return [quality.quality for quality in qualities] + + def get_dataset(dataset_id: int, connection: Connection) -> dict[str, Any] | None: columns = get_column_names(connection, "dataset") row = connection.execute( diff --git a/src/routers/v1/qualities.py b/src/routers/v1/qualities.py index fc568b96..a00372d8 100644 --- a/src/routers/v1/qualities.py +++ b/src/routers/v1/qualities.py @@ -1,18 +1,21 @@ -from typing import Literal +from typing import Annotated, Literal -from fastapi import APIRouter +from database.datasets import list_all_qualities +from fastapi import APIRouter, Depends +from sqlalchemy import Connection + +from routers.dependencies import expdb_connection router = APIRouter(prefix="/v1/datasets", tags=["datasets"]) @router.get("/qualities/list") -def list_qualities() -> ( - dict[ - Literal["data_qualities_list"], - dict[ - Literal["quality"], - list[str], - ], - ] -): - return {"data_qualities_list": {"quality": []}} +def list_qualities( + expdb: Annotated[Connection, Depends(expdb_connection)], +) -> dict[Literal["data_qualities_list"], dict[Literal["quality"], list[str]]]: + qualities = list_all_qualities(connection=expdb) + return { + "data_qualities_list": { + "quality": qualities, + }, + } diff --git a/tests/routers/v1/qualities_test.py b/tests/routers/v1/qualities_test.py index e85f56a1..49947806 100644 --- a/tests/routers/v1/qualities_test.py +++ b/tests/routers/v1/qualities_test.py @@ -1,9 +1,10 @@ import http.client +from sqlalchemy import Connection, text from starlette.testclient import TestClient -def test_list_qualities_identical(api_client: TestClient) -> None: +def test_list_qualities_identical(api_client: TestClient, expdb_test: Connection) -> None: response = api_client.get("/v1/datasets/qualities/list") assert response.status_code == http.client.OK expected = { @@ -120,3 +121,26 @@ def test_list_qualities_identical(api_client: TestClient) -> None: }, } assert expected == response.json() + + deleted = expected["data_qualities_list"]["quality"].pop() + expdb_test.execute( + text( + """ + DELETE FROM data_quality + WHERE `quality`=:deleted_quality + """, + ), + parameters={"deleted_quality": deleted}, + ) + expdb_test.execute( + text( + """ + DELETE FROM quality + WHERE `name`=:deleted_quality + """, + ), + parameters={"deleted_quality": deleted}, + ) + response = api_client.get("/v1/datasets/qualities/list") + assert response.status_code == http.client.OK + assert expected == response.json() From f39dd24b822088f0a36ac72bc678711dab85b6f0 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Nov 2023 11:34:45 +0100 Subject: [PATCH 4/5] Add test to verify behavior is same as PHP endpoint Ideally we would want to test this against several database states for example by deleting or adding new qualities. However, this will not work without committing to the database. So we either need to spin up a new database and php api for this test to be idempotent or we lose idempotency. Neither seem worth the overhead. --- tests/routers/v1/qualities_test.py | 54 +++++++++++++++++++----------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/tests/routers/v1/qualities_test.py b/tests/routers/v1/qualities_test.py index 49947806..f1a7a605 100644 --- a/tests/routers/v1/qualities_test.py +++ b/tests/routers/v1/qualities_test.py @@ -1,10 +1,42 @@ import http.client +import httpx +import pytest from sqlalchemy import Connection, text from starlette.testclient import TestClient -def test_list_qualities_identical(api_client: TestClient, expdb_test: Connection) -> None: +def _remove_quality_from_database(quality_name: str, expdb_test: Connection) -> None: + expdb_test.execute( + text( + """ + DELETE FROM data_quality + WHERE `quality`=:deleted_quality + """, + ), + parameters={"deleted_quality": quality_name}, + ) + expdb_test.execute( + text( + """ + DELETE FROM quality + WHERE `name`=:deleted_quality + """, + ), + parameters={"deleted_quality": quality_name}, + ) + + +@pytest.mark.php() +def test_list_qualities_identical(api_client: TestClient) -> None: + original = httpx.get("http://server-api-php-api-1:80/api/v1/json/data/qualities/list") + new = api_client.get("/v1/datasets/qualities/list") + assert original.status_code == new.status_code + assert original.json() == new.json() + # To keep the test idempotent, we cannot test if reaction to database changes is identical + + +def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None: response = api_client.get("/v1/datasets/qualities/list") assert response.status_code == http.client.OK expected = { @@ -123,24 +155,8 @@ def test_list_qualities_identical(api_client: TestClient, expdb_test: Connection assert expected == response.json() deleted = expected["data_qualities_list"]["quality"].pop() - expdb_test.execute( - text( - """ - DELETE FROM data_quality - WHERE `quality`=:deleted_quality - """, - ), - parameters={"deleted_quality": deleted}, - ) - expdb_test.execute( - text( - """ - DELETE FROM quality - WHERE `name`=:deleted_quality - """, - ), - parameters={"deleted_quality": deleted}, - ) + _remove_quality_from_database(quality_name=deleted, expdb_test=expdb_test) + response = api_client.get("/v1/datasets/qualities/list") assert response.status_code == http.client.OK assert expected == response.json() From 882c71f88050a822a92b74ca4d4f06155bac2b90 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Fri, 10 Nov 2023 11:53:25 +0100 Subject: [PATCH 5/5] Add clarification on faithfulness and sketch V1-to-V2 changes --- docs/migration.md | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/docs/migration.md b/docs/migration.md index 46f41e23..c629a141 100644 --- a/docs/migration.md +++ b/docs/migration.md @@ -16,7 +16,11 @@ in which case using the generated REST API documentation is recommended. The first iteration of the new server has nearly identical responses to the old JSON endpoints, but there are exceptions. Most exceptions either bug fixes, or arise from -technical limitations. +technical limitations. This list covers the most important changes, but there may +be some undocumented changes for edge cases. The PHP API was underspecified, and we +decided that reverse engineering the specifications which mostly arise from +implementation details was not worth the effort. If there is a behavioral change which +was not documented but affects you, please [open a bug report](https://github.com/openml/server-api/issues/new?assignees=&labels=bug%2C+triage&projects=&template=bug-report.md&title=). ### All Endpoints The following changes affect all endpoints. @@ -35,7 +39,7 @@ and JSON content will be different. + {"detail":[{"loc":["query","_dataset_id"],"msg":"value is not a valid integer","type":"type_error.integer"}]} ``` -!!! Bug "Input validation has been added to many end points" +!!! warning "Input validation has been added to many end points" There are endpoints which previously did not do any input validation. These endpoints now do enforce stricter input constraints. @@ -75,7 +79,16 @@ Python-V1 will always return JSON. ## V1 to V2 -Most of the changes are focused on standardizing responses. +Most of the changes are focused on standardizing responses, working on: + + * using JSON types. + * removing levels of nesting for endpoints which return single-field JSON. + * always returning lists for fields which may contain multiple values even if it + contains only one element or no element. + * restricting or expanding input types as appropriate. + * standardizing authentication and access messages, and consistently execute those checks + before fetching data or providing error messages about the data. + ### Datasets