From e545660e8fbb70c6d01ebf5a14c305785c9d49f7 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 29 Nov 2023 14:08:23 +0100 Subject: [PATCH 1/6] Add datasets/features/{id} endpoint --- src/database/datasets.py | 31 ++++++++++++++- src/routers/openml/datasets.py | 46 ++++++++++++++++++----- src/schemas/datasets/openml.py | 17 +++++++++ tests/routers/openml/datasets_test.py | 54 +++++++++++++++++++++++++++ 4 files changed, 138 insertions(+), 10 deletions(-) diff --git a/src/database/datasets.py b/src/database/datasets.py index ffabb7b3..02360b8f 100644 --- a/src/database/datasets.py +++ b/src/database/datasets.py @@ -2,7 +2,7 @@ from collections import defaultdict from typing import Any, Iterable -from schemas.datasets.openml import Quality +from schemas.datasets.openml import Feature, Quality from sqlalchemy import Connection, text from database.meta import get_column_names @@ -172,3 +172,32 @@ def get_latest_processing_update(dataset_id: int, connection: Connection) -> dic return ( dict(zip(columns, result[0], strict=True), strict=True) if (result := list(row)) else None ) + + +def get_features_for_dataset(dataset_id: int, connection: Connection) -> list[Feature]: + rows = connection.execute( + text( + """ + SELECT `index`,`name`,`data_type`,`is_target`, + `is_row_identifier`,`is_ignore`,`NumberOfMissingValues` as `number_of_missing_values` + FROM data_feature + WHERE `did` = :dataset_id + """, + ), + parameters={"dataset_id": dataset_id}, + ) + return [Feature(**row, nominal_values=None) for row in rows.mappings()] + + +def get_feature_values(dataset_id: int, feature_index: int, connection: Connection) -> list[str]: + rows = connection.execute( + text( + """ + SELECT `value` + FROM data_feature_value + WHERE `did` = :dataset_id AND `index` = :feature_index + """, + ), + parameters={"dataset_id": dataset_id, "feature_index": feature_index}, + ) + return [row.value for row in rows] diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py index 
08db9a13..d707148a 100644 --- a/src/routers/openml/datasets.py +++ b/src/routers/openml/datasets.py @@ -19,6 +19,8 @@ ) from database.datasets import get_dataset as db_get_dataset from database.datasets import ( + get_feature_values, + get_features_for_dataset, get_file, get_latest_dataset_description, get_latest_processing_update, @@ -29,7 +31,7 @@ from database.datasets import tag_dataset as db_tag_dataset from database.users import User, UserGroup from fastapi import APIRouter, Body, Depends, HTTPException -from schemas.datasets.openml import DatasetMetadata, DatasetStatus +from schemas.datasets.openml import DatasetMetadata, DatasetStatus, Feature, FeatureType from sqlalchemy import Connection, text from routers.dependencies import Pagination, expdb_connection, fetch_user, userdb_connection @@ -263,6 +265,39 @@ def _get_processing_information(dataset_id: int, connection: Connection) -> Proc return ProcessingInformation(date=date_processed, warning=warning, error=error) +def _get_dataset_raise_otherwise( + dataset_id: int, + user: User | None, + expdb: Connection, +) -> dict[str, Any]: + """Fetches the dataset from the database if it exists and the user has permissions. + + Raises HTTPException if the dataset does not exist or the user can not access it. 
+ """ + if not (dataset := db_get_dataset(dataset_id, expdb)): + error = _format_error(code=DatasetError.NOT_FOUND, message="Unknown dataset") + raise HTTPException(status_code=http.client.NOT_FOUND, detail=error) + + if not _user_has_access(dataset=dataset, user=user): + error = _format_error(code=DatasetError.NO_ACCESS, message="No access granted") + raise HTTPException(status_code=http.client.FORBIDDEN, detail=error) + + return dataset + + +@router.get("/features/{dataset_id}", response_model_exclude_none=True) +def get_dataset_features( + dataset_id: int, + user: Annotated[User | None, Depends(fetch_user)] = None, + expdb: Annotated[Connection, Depends(expdb_connection)] = None, +) -> list[Feature]: + _get_dataset_raise_otherwise(dataset_id, user, expdb) + features = get_features_for_dataset(dataset_id, expdb) + for feature in [f for f in features if f.data_type == FeatureType.NOMINAL]: + feature.nominal_values = get_feature_values(dataset_id, feature.index, expdb) + return features + + @router.get( path="/{dataset_id}", description="Get meta-data for dataset with ID `dataset_id`.", @@ -273,14 +308,7 @@ def get_dataset( user_db: Annotated[Connection, Depends(userdb_connection)] = None, expdb_db: Annotated[Connection, Depends(expdb_connection)] = None, ) -> DatasetMetadata: - if not (dataset := db_get_dataset(dataset_id, expdb_db)): - error = _format_error(code=DatasetError.NOT_FOUND, message="Unknown dataset") - raise HTTPException(status_code=http.client.NOT_FOUND, detail=error) - - if not _user_has_access(dataset=dataset, user=user): - error = _format_error(code=DatasetError.NO_ACCESS, message="No access granted") - raise HTTPException(status_code=http.client.FORBIDDEN, detail=error) - + dataset = _get_dataset_raise_otherwise(dataset_id, user, expdb_db) if not (dataset_file := get_file(dataset["file_id"], user_db)): error = _format_error( code=DatasetError.NO_DATA_FILE, diff --git a/src/schemas/datasets/openml.py b/src/schemas/datasets/openml.py index 
e1360006..3550bdde 100644 --- a/src/schemas/datasets/openml.py +++ b/src/schemas/datasets/openml.py @@ -29,6 +29,23 @@ class Quality(BaseModel): value: float | None +class FeatureType(StrEnum): + NUMERIC = "numeric" + NOMINAL = "nominal" + STRING = "string" + + +class Feature(BaseModel): + index: int + name: str + data_type: FeatureType + is_target: bool + is_ignore: bool + is_row_identifier: bool + number_of_missing_values: int + nominal_values: list[str] | None + + class DatasetMetadata(BaseModel): id_: int = Field(json_schema_extra={"example": 1}, alias="id") visibility: Visibility = Field(json_schema_extra={"example": Visibility.PUBLIC}) diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py index 98a11941..37829c19 100644 --- a/tests/routers/openml/datasets_test.py +++ b/tests/routers/openml/datasets_test.py @@ -58,3 +58,57 @@ def test_private_dataset_owner_access( def test_private_dataset_admin_access(py_api: TestClient) -> None: cast(httpx.Response, py_api.get("/v2/datasets/130?api_key=...")) # test against cached response + + +def test_dataset_features(py_api: TestClient) -> None: + # Dataset 4 has both nominal and numerical features, so provides reasonable coverage + response = py_api.get("/datasets/features/4") + assert response.status_code == http.client.OK + assert response.json() == [ + { + "index": 0, + "name": "left-weight", + "data_type": "numeric", + "is_target": False, + "is_ignore": False, + "is_row_identifier": False, + "number_of_missing_values": 0, + }, + { + "index": 1, + "name": "left-distance", + "data_type": "numeric", + "is_target": False, + "is_ignore": False, + "is_row_identifier": False, + "number_of_missing_values": 0, + }, + { + "index": 2, + "name": "right-weight", + "data_type": "numeric", + "is_target": False, + "is_ignore": False, + "is_row_identifier": False, + "number_of_missing_values": 0, + }, + { + "index": 3, + "name": "right-distance", + "data_type": "numeric", + "is_target": False, + 
"is_ignore": False, + "is_row_identifier": False, + "number_of_missing_values": 0, + }, + { + "index": 4, + "name": "class", + "data_type": "nominal", + "nominal_values": ["B", "L", "R"], + "is_target": True, + "is_ignore": False, + "is_row_identifier": False, + "number_of_missing_values": 0, + }, + ] From 31632caecc90fa03f366bed3a11d21a93346de1d Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 29 Nov 2023 16:43:15 +0100 Subject: [PATCH 2/6] Add integration test for datasets/features/id --- .../migration/datasets_migration_test.py | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py index 6d93827c..19ab7785 100644 --- a/tests/routers/openml/migration/datasets_migration_test.py +++ b/tests/routers/openml/migration/datasets_migration_test.py @@ -190,3 +190,32 @@ def test_dataset_tag_response_is_identical( original = original.json() new = new.json() assert original == new + + +@pytest.mark.php() +@pytest.mark.parametrize( + "data_id", + list(range(1, 130)), +) +def test_datasets_feature_is_identical( + data_id: int, + py_api: TestClient, + php_api: httpx.Client, +) -> None: + if data_id in [55, 56, 59]: + pytest.skip('Error message for "274: No features found." 
not implemented') + response = py_api.get(f"/datasets/features/{data_id}") + original = php_api.get(f"/data/features/{data_id}") + assert response.status_code == original.status_code + python_body = response.json() + for feature in python_body: + for key, value in list(feature.items()): + if key == "nominal_values": + # The old API uses `nominal_value` instead of `nominal_values` + values = feature.pop(key) + # The old API returns a str if there is only a single element + feature["nominal_value"] = values if len(values) > 1 else values[0] + else: + # The old API formats bool as string in lower-case + feature[key] = str(value) if not isinstance(value, bool) else str(value).lower() + assert python_body == original.json()["data_features"]["feature"] From f5efd96326a269e7e34c6ca62e344a0cd9b2d1c2 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Wed, 29 Nov 2023 17:06:22 +0100 Subject: [PATCH 3/6] Add return detailed error codes when features are not found --- src/routers/openml/datasets.py | 19 +++++++++++++++++++ .../migration/datasets_migration_test.py | 9 +++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py index d707148a..b10a8417 100644 --- a/src/routers/openml/datasets.py +++ b/src/routers/openml/datasets.py @@ -295,6 +295,25 @@ def get_dataset_features( features = get_features_for_dataset(dataset_id, expdb) for feature in [f for f in features if f.data_type == FeatureType.NOMINAL]: feature.nominal_values = get_feature_values(dataset_id, feature.index, expdb) + + if not features: + processing_state = get_latest_processing_update(dataset_id, expdb) + if processing_state is None: + code, msg = ( + 273, + "Dataset not processed yet. The dataset was not processed yet, features are not yet available. Please wait for a few minutes.", # noqa: E501 + ) + elif processing_state.get("error"): + code, msg = 274, "No features found. 
Additionally, dataset processed with error" + else: + code, msg = ( + 272, + "No features found. The dataset did not contain any features, or we could not extract them.", # noqa: E501 + ) + raise HTTPException( + status_code=http.client.PRECONDITION_FAILED, + detail={"code": code, "message": msg}, + ) return features diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py index 19ab7785..d7460dcc 100644 --- a/tests/routers/openml/migration/datasets_migration_test.py +++ b/tests/routers/openml/migration/datasets_migration_test.py @@ -202,11 +202,16 @@ def test_datasets_feature_is_identical( py_api: TestClient, php_api: httpx.Client, ) -> None: - if data_id in [55, 56, 59]: - pytest.skip('Error message for "274: No features found." not implemented') response = py_api.get(f"/datasets/features/{data_id}") original = php_api.get(f"/data/features/{data_id}") assert response.status_code == original.status_code + + if response.status_code != http.client.OK: + error = response.json()["detail"] + error["code"] = str(error["code"]) + assert error == original.json()["error"] + return + python_body = response.json() for feature in python_body: for key, value in list(feature.items()): From df8a2c694ac7e333dbf2ab50f672a8fe92e98439 Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 30 Nov 2023 10:30:13 +0100 Subject: [PATCH 4/6] Add access tests for datasets/features/{id} endpoint --- tests/routers/openml/datasets_test.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py index 37829c19..1fa2a619 100644 --- a/tests/routers/openml/datasets_test.py +++ b/tests/routers/openml/datasets_test.py @@ -5,6 +5,8 @@ import pytest from starlette.testclient import TestClient +from tests.conftest import ApiKey + @pytest.mark.parametrize( ("dataset_id", "response_code"), @@ -112,3 +114,17 @@ def test_dataset_features(py_api: 
TestClient) -> None: "number_of_missing_values": 0, }, ] + + +def test_dataset_features_no_access(py_api: TestClient) -> None: + response = py_api.get("/datasets/features/130") + assert response.status_code == http.client.FORBIDDEN + + +@pytest.mark.parametrize( + "api_key", + [ApiKey.ADMIN, ApiKey.OWNER_USER], +) +def test_dataset_features_access_to_private(api_key: ApiKey, py_api: TestClient) -> None: + response = py_api.get(f"/datasets/features/130?api_key={api_key}") + assert response.status_code == http.client.OK From cd534f8c17461587c7858bbefffc095831db730b Mon Sep 17 00:00:00 2001 From: PGijsbers Date: Thu, 30 Nov 2023 10:36:51 +0100 Subject: [PATCH 5/6] Test datasets/features/{id} for dataset with error in processing --- tests/routers/openml/datasets_test.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py index 1fa2a619..04376f02 100644 --- a/tests/routers/openml/datasets_test.py +++ b/tests/routers/openml/datasets_test.py @@ -128,3 +128,14 @@ def test_dataset_features_no_access(py_api: TestClient) -> None: def test_dataset_features_access_to_private(api_key: ApiKey, py_api: TestClient) -> None: response = py_api.get(f"/datasets/features/130?api_key={api_key}") assert response.status_code == http.client.OK + + +def test_dataset_features_with_processing_error(py_api: TestClient) -> None: + # When a dataset is processed to extract its feature metadata, errors may occur. + # In that case, no feature information will ever be available. + response = py_api.get("/datasets/features/55") + assert response.status_code == http.client.PRECONDITION_FAILED + assert response.json()["detail"] == { + "code": 274, + "message": "No features found. 
Additionally, dataset processed with error",
+    }

From ab4f72d8417e7ea13b459efa6b31110a1408e12f Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 30 Nov 2023 10:39:14 +0100
Subject: [PATCH 6/6] Add test for datasets/features/{id} when the dataset
 does not exist

---
 tests/routers/openml/datasets_test.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index 04376f02..52a8e711 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -139,3 +139,8 @@ def test_dataset_features_with_processing_error(py_api: TestClient) -> None:
         "code": 274,
         "message": "No features found. Additionally, dataset processed with error",
     }
+
+
+def test_dataset_features_dataset_does_not_exist(py_api: TestClient) -> None:
+    resource = py_api.get("/datasets/features/1000")
+    assert resource.status_code == http.client.NOT_FOUND