diff --git a/docs/migration.md b/docs/migration.md index 46f41e23..c629a141 100644 --- a/docs/migration.md +++ b/docs/migration.md @@ -16,7 +16,11 @@ in which case using the generated REST API documentation is recommended. The first iteration of the new server has nearly identical responses to the old JSON endpoints, but there are exceptions. Most exceptions either bug fixes, or arise from -technical limitations. +technical limitations. This list covers the most important changes, but there may +be some undocumented changes for edge cases. The PHP API was underspecified, and we +decided that reverse engineering the specifications which mostly arise from +implementation details was not worth the effort. If there is a behavioral change which +was not documented but affects you, please [open a bug report](https://github.com/openml/server-api/issues/new?assignees=&labels=bug%2C+triage&projects=&template=bug-report.md&title=). ### All Endpoints The following changes affect all endpoints. @@ -35,7 +39,7 @@ and JSON content will be different. + {"detail":[{"loc":["query","_dataset_id"],"msg":"value is not a valid integer","type":"type_error.integer"}]} ``` -!!! Bug "Input validation has been added to many end points" +!!! warning "Input validation has been added to many end points" There are endpoints which previously did not do any input validation. These endpoints now do enforce stricter input constraints. @@ -75,7 +79,16 @@ Python-V1 will always return JSON. ## V1 to V2 -Most of the changes are focused on standardizing responses. +Most of the changes are focused on standardizing responses, working on: + + * using JSON types. + * removing levels of nesting for endpoints which return single-field JSON. + * always returning lists for fields which may contain multiple values even if it + contains only one element or no element. + * restricting or expanding input types as appropriate. 
+ * standardizing authentication and access messages, and consistently executing those checks + before fetching data or providing error messages about the data. + ### Datasets diff --git a/src/database/datasets.py b/src/database/datasets.py index b3f453ff..62e7df97 100644 --- a/src/database/datasets.py +++ b/src/database/datasets.py @@ -6,6 +6,20 @@ from database.meta import get_column_names +def list_all_qualities(connection: Connection) -> list[str]: + # The current implementation only fetches *used* qualities, otherwise you should + # query: SELECT `name` FROM `quality` WHERE `type`='DataQuality' + qualities = connection.execute( + text( + """ + SELECT DISTINCT(`quality`) + FROM data_quality + """, + ), + ) + return [quality.quality for quality in qualities] + + def get_dataset(dataset_id: int, connection: Connection) -> dict[str, Any] | None: columns = get_column_names(connection, "dataset") row = connection.execute( diff --git a/src/main.py b/src/main.py index 3c5d595c..44f7cefb 100644 --- a/src/main.py +++ b/src/main.py @@ -3,7 +3,8 @@ import uvicorn from fastapi import FastAPI from routers.mldcat_ap.dataset import router as mldcat_ap_router -from routers.v1.datasets import router as datasets_router_old_format +from routers.v1.datasets import router as datasets_router_v1_format +from routers.v1.qualities import router as qualities_router from routers.v2.datasets import router as datasets_router @@ -37,7 +38,8 @@ def create_api() -> FastAPI: app = FastAPI() app.include_router(datasets_router) - app.include_router(datasets_router_old_format) + app.include_router(datasets_router_v1_format) + app.include_router(qualities_router) app.include_router(mldcat_ap_router) return app diff --git a/src/routers/v1/qualities.py b/src/routers/v1/qualities.py new file mode 100644 index 00000000..a00372d8 --- /dev/null +++ b/src/routers/v1/qualities.py @@ -0,0 +1,21 @@ +from typing import Annotated, Literal + +from database.datasets import list_all_qualities +from fastapi import 
APIRouter, Depends +from sqlalchemy import Connection + +from routers.dependencies import expdb_connection + +router = APIRouter(prefix="/v1/datasets", tags=["datasets"]) + + +@router.get("/qualities/list") +def list_qualities( + expdb: Annotated[Connection, Depends(expdb_connection)], +) -> dict[Literal["data_qualities_list"], dict[Literal["quality"], list[str]]]: + qualities = list_all_qualities(connection=expdb) + return { + "data_qualities_list": { + "quality": qualities, + }, + } diff --git a/tests/routers/v1/qualities_test.py b/tests/routers/v1/qualities_test.py new file mode 100644 index 00000000..f1a7a605 --- /dev/null +++ b/tests/routers/v1/qualities_test.py @@ -0,0 +1,162 @@ +import http.client + +import httpx +import pytest +from sqlalchemy import Connection, text +from starlette.testclient import TestClient + + +def _remove_quality_from_database(quality_name: str, expdb_test: Connection) -> None: + expdb_test.execute( + text( + """ + DELETE FROM data_quality + WHERE `quality`=:deleted_quality + """, + ), + parameters={"deleted_quality": quality_name}, + ) + expdb_test.execute( + text( + """ + DELETE FROM quality + WHERE `name`=:deleted_quality + """, + ), + parameters={"deleted_quality": quality_name}, + ) + + +@pytest.mark.php() +def test_list_qualities_identical(api_client: TestClient) -> None: + original = httpx.get("http://server-api-php-api-1:80/api/v1/json/data/qualities/list") + new = api_client.get("/v1/datasets/qualities/list") + assert original.status_code == new.status_code + assert original.json() == new.json() + # To keep the test idempotent, we cannot test if reaction to database changes is identical + + +def test_list_qualities(api_client: TestClient, expdb_test: Connection) -> None: + response = api_client.get("/v1/datasets/qualities/list") + assert response.status_code == http.client.OK + expected = { + "data_qualities_list": { + "quality": [ + "AutoCorrelation", + "CfsSubsetEval_DecisionStumpAUC", + 
"CfsSubsetEval_DecisionStumpErrRate", + "CfsSubsetEval_DecisionStumpKappa", + "CfsSubsetEval_NaiveBayesAUC", + "CfsSubsetEval_NaiveBayesErrRate", + "CfsSubsetEval_NaiveBayesKappa", + "CfsSubsetEval_kNN1NAUC", + "CfsSubsetEval_kNN1NErrRate", + "CfsSubsetEval_kNN1NKappa", + "ClassEntropy", + "DecisionStumpAUC", + "DecisionStumpErrRate", + "DecisionStumpKappa", + "Dimensionality", + "EquivalentNumberOfAtts", + "J48.00001.AUC", + "J48.00001.ErrRate", + "J48.00001.Kappa", + "J48.0001.AUC", + "J48.0001.ErrRate", + "J48.0001.Kappa", + "J48.001.AUC", + "J48.001.ErrRate", + "J48.001.Kappa", + "MajorityClassPercentage", + "MajorityClassSize", + "MaxAttributeEntropy", + "MaxKurtosisOfNumericAtts", + "MaxMeansOfNumericAtts", + "MaxMutualInformation", + "MaxNominalAttDistinctValues", + "MaxSkewnessOfNumericAtts", + "MaxStdDevOfNumericAtts", + "MeanAttributeEntropy", + "MeanKurtosisOfNumericAtts", + "MeanMeansOfNumericAtts", + "MeanMutualInformation", + "MeanNoiseToSignalRatio", + "MeanNominalAttDistinctValues", + "MeanSkewnessOfNumericAtts", + "MeanStdDevOfNumericAtts", + "MinAttributeEntropy", + "MinKurtosisOfNumericAtts", + "MinMeansOfNumericAtts", + "MinMutualInformation", + "MinNominalAttDistinctValues", + "MinSkewnessOfNumericAtts", + "MinStdDevOfNumericAtts", + "MinorityClassPercentage", + "MinorityClassSize", + "NaiveBayesAUC", + "NaiveBayesErrRate", + "NaiveBayesKappa", + "NumberOfBinaryFeatures", + "NumberOfClasses", + "NumberOfFeatures", + "NumberOfInstances", + "NumberOfInstancesWithMissingValues", + "NumberOfMissingValues", + "NumberOfNumericFeatures", + "NumberOfSymbolicFeatures", + "PercentageOfBinaryFeatures", + "PercentageOfInstancesWithMissingValues", + "PercentageOfMissingValues", + "PercentageOfNumericFeatures", + "PercentageOfSymbolicFeatures", + "Quartile1AttributeEntropy", + "Quartile1KurtosisOfNumericAtts", + "Quartile1MeansOfNumericAtts", + "Quartile1MutualInformation", + "Quartile1SkewnessOfNumericAtts", + "Quartile1StdDevOfNumericAtts", + 
"Quartile2AttributeEntropy", + "Quartile2KurtosisOfNumericAtts", + "Quartile2MeansOfNumericAtts", + "Quartile2MutualInformation", + "Quartile2SkewnessOfNumericAtts", + "Quartile2StdDevOfNumericAtts", + "Quartile3AttributeEntropy", + "Quartile3KurtosisOfNumericAtts", + "Quartile3MeansOfNumericAtts", + "Quartile3MutualInformation", + "Quartile3SkewnessOfNumericAtts", + "Quartile3StdDevOfNumericAtts", + "REPTreeDepth1AUC", + "REPTreeDepth1ErrRate", + "REPTreeDepth1Kappa", + "REPTreeDepth2AUC", + "REPTreeDepth2ErrRate", + "REPTreeDepth2Kappa", + "REPTreeDepth3AUC", + "REPTreeDepth3ErrRate", + "REPTreeDepth3Kappa", + "RandomTreeDepth1AUC", + "RandomTreeDepth1ErrRate", + "RandomTreeDepth1Kappa", + "RandomTreeDepth2AUC", + "RandomTreeDepth2ErrRate", + "RandomTreeDepth2Kappa", + "RandomTreeDepth3AUC", + "RandomTreeDepth3ErrRate", + "RandomTreeDepth3Kappa", + "StdvNominalAttDistinctValues", + "kNN1NAUC", + "kNN1NErrRate", + "kNN1NKappa", + ], + }, + } + assert expected == response.json() + + deleted = expected["data_qualities_list"]["quality"].pop() + _remove_quality_from_database(quality_name=deleted, expdb_test=expdb_test) + + response = api_client.get("/v1/datasets/qualities/list") + assert response.status_code == http.client.OK + assert expected == response.json()