From e545660e8fbb70c6d01ebf5a14c305785c9d49f7 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 29 Nov 2023 14:08:23 +0100
Subject: [PATCH 1/6] Add datasets/features/{id} endpoint
---
src/database/datasets.py | 31 ++++++++++++++-
src/routers/openml/datasets.py | 46 ++++++++++++++++++-----
src/schemas/datasets/openml.py | 17 +++++++++
tests/routers/openml/datasets_test.py | 54 +++++++++++++++++++++++++++
4 files changed, 138 insertions(+), 10 deletions(-)
diff --git a/src/database/datasets.py b/src/database/datasets.py
index ffabb7b3..02360b8f 100644
--- a/src/database/datasets.py
+++ b/src/database/datasets.py
@@ -2,7 +2,7 @@
from collections import defaultdict
from typing import Any, Iterable
-from schemas.datasets.openml import Quality
+from schemas.datasets.openml import Feature, Quality
from sqlalchemy import Connection, text
from database.meta import get_column_names
@@ -172,3 +172,32 @@ def get_latest_processing_update(dataset_id: int, connection: Connection) -> dic
return (
dict(zip(columns, result[0], strict=True), strict=True) if (result := list(row)) else None
)
+
+
+def get_features_for_dataset(dataset_id: int, connection: Connection) -> list[Feature]:
+ rows = connection.execute(
+ text(
+ """
+ SELECT `index`,`name`,`data_type`,`is_target`,
+ `is_row_identifier`,`is_ignore`,`NumberOfMissingValues` as `number_of_missing_values`
+ FROM data_feature
+ WHERE `did` = :dataset_id
+ """,
+ ),
+ parameters={"dataset_id": dataset_id},
+ )
+ return [Feature(**row, nominal_values=None) for row in rows.mappings()]
+
+
+def get_feature_values(dataset_id: int, feature_index: int, connection: Connection) -> list[str]:
+ rows = connection.execute(
+ text(
+ """
+ SELECT `value`
+ FROM data_feature_value
+ WHERE `did` = :dataset_id AND `index` = :feature_index
+ """,
+ ),
+ parameters={"dataset_id": dataset_id, "feature_index": feature_index},
+ )
+ return [row.value for row in rows]
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index 08db9a13..d707148a 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -19,6 +19,8 @@
)
from database.datasets import get_dataset as db_get_dataset
from database.datasets import (
+ get_feature_values,
+ get_features_for_dataset,
get_file,
get_latest_dataset_description,
get_latest_processing_update,
@@ -29,7 +31,7 @@
from database.datasets import tag_dataset as db_tag_dataset
from database.users import User, UserGroup
from fastapi import APIRouter, Body, Depends, HTTPException
-from schemas.datasets.openml import DatasetMetadata, DatasetStatus
+from schemas.datasets.openml import DatasetMetadata, DatasetStatus, Feature, FeatureType
from sqlalchemy import Connection, text
from routers.dependencies import Pagination, expdb_connection, fetch_user, userdb_connection
@@ -263,6 +265,39 @@ def _get_processing_information(dataset_id: int, connection: Connection) -> Proc
return ProcessingInformation(date=date_processed, warning=warning, error=error)
+def _get_dataset_raise_otherwise(
+ dataset_id: int,
+ user: User | None,
+ expdb: Connection,
+) -> dict[str, Any]:
+ """Fetches the dataset from the database if it exists and the user has permissions.
+
+ Raises HTTPException if the dataset does not exist or the user can not access it.
+ """
+ if not (dataset := db_get_dataset(dataset_id, expdb)):
+ error = _format_error(code=DatasetError.NOT_FOUND, message="Unknown dataset")
+ raise HTTPException(status_code=http.client.NOT_FOUND, detail=error)
+
+ if not _user_has_access(dataset=dataset, user=user):
+ error = _format_error(code=DatasetError.NO_ACCESS, message="No access granted")
+ raise HTTPException(status_code=http.client.FORBIDDEN, detail=error)
+
+ return dataset
+
+
+@router.get("/features/{dataset_id}", response_model_exclude_none=True)
+def get_dataset_features(
+ dataset_id: int,
+ user: Annotated[User | None, Depends(fetch_user)] = None,
+ expdb: Annotated[Connection, Depends(expdb_connection)] = None,
+) -> list[Feature]:
+ _get_dataset_raise_otherwise(dataset_id, user, expdb)
+ features = get_features_for_dataset(dataset_id, expdb)
+ for feature in [f for f in features if f.data_type == FeatureType.NOMINAL]:
+ feature.nominal_values = get_feature_values(dataset_id, feature.index, expdb)
+ return features
+
+
@router.get(
path="/{dataset_id}",
description="Get meta-data for dataset with ID `dataset_id`.",
@@ -273,14 +308,7 @@ def get_dataset(
user_db: Annotated[Connection, Depends(userdb_connection)] = None,
expdb_db: Annotated[Connection, Depends(expdb_connection)] = None,
) -> DatasetMetadata:
- if not (dataset := db_get_dataset(dataset_id, expdb_db)):
- error = _format_error(code=DatasetError.NOT_FOUND, message="Unknown dataset")
- raise HTTPException(status_code=http.client.NOT_FOUND, detail=error)
-
- if not _user_has_access(dataset=dataset, user=user):
- error = _format_error(code=DatasetError.NO_ACCESS, message="No access granted")
- raise HTTPException(status_code=http.client.FORBIDDEN, detail=error)
-
+ dataset = _get_dataset_raise_otherwise(dataset_id, user, expdb_db)
if not (dataset_file := get_file(dataset["file_id"], user_db)):
error = _format_error(
code=DatasetError.NO_DATA_FILE,
diff --git a/src/schemas/datasets/openml.py b/src/schemas/datasets/openml.py
index e1360006..3550bdde 100644
--- a/src/schemas/datasets/openml.py
+++ b/src/schemas/datasets/openml.py
@@ -29,6 +29,23 @@ class Quality(BaseModel):
value: float | None
+class FeatureType(StrEnum):
+ NUMERIC = "numeric"
+ NOMINAL = "nominal"
+ STRING = "string"
+
+
+class Feature(BaseModel):
+ index: int
+ name: str
+ data_type: FeatureType
+ is_target: bool
+ is_ignore: bool
+ is_row_identifier: bool
+ number_of_missing_values: int
+ nominal_values: list[str] | None
+
+
class DatasetMetadata(BaseModel):
id_: int = Field(json_schema_extra={"example": 1}, alias="id")
visibility: Visibility = Field(json_schema_extra={"example": Visibility.PUBLIC})
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index 98a11941..37829c19 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -58,3 +58,57 @@ def test_private_dataset_owner_access(
def test_private_dataset_admin_access(py_api: TestClient) -> None:
cast(httpx.Response, py_api.get("/v2/datasets/130?api_key=..."))
# test against cached response
+
+
+def test_dataset_features(py_api: TestClient) -> None:
+ # Dataset 4 has both nominal and numerical features, so provides reasonable coverage
+ response = py_api.get("/datasets/features/4")
+ assert response.status_code == http.client.OK
+ assert response.json() == [
+ {
+ "index": 0,
+ "name": "left-weight",
+ "data_type": "numeric",
+ "is_target": False,
+ "is_ignore": False,
+ "is_row_identifier": False,
+ "number_of_missing_values": 0,
+ },
+ {
+ "index": 1,
+ "name": "left-distance",
+ "data_type": "numeric",
+ "is_target": False,
+ "is_ignore": False,
+ "is_row_identifier": False,
+ "number_of_missing_values": 0,
+ },
+ {
+ "index": 2,
+ "name": "right-weight",
+ "data_type": "numeric",
+ "is_target": False,
+ "is_ignore": False,
+ "is_row_identifier": False,
+ "number_of_missing_values": 0,
+ },
+ {
+ "index": 3,
+ "name": "right-distance",
+ "data_type": "numeric",
+ "is_target": False,
+ "is_ignore": False,
+ "is_row_identifier": False,
+ "number_of_missing_values": 0,
+ },
+ {
+ "index": 4,
+ "name": "class",
+ "data_type": "nominal",
+ "nominal_values": ["B", "L", "R"],
+ "is_target": True,
+ "is_ignore": False,
+ "is_row_identifier": False,
+ "number_of_missing_values": 0,
+ },
+ ]
From 31632caecc90fa03f366bed3a11d21a93346de1d Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 29 Nov 2023 16:43:15 +0100
Subject: [PATCH 2/6] Add integration test for datasets/features/{id}
---
.../migration/datasets_migration_test.py | 29 +++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py
index 6d93827c..19ab7785 100644
--- a/tests/routers/openml/migration/datasets_migration_test.py
+++ b/tests/routers/openml/migration/datasets_migration_test.py
@@ -190,3 +190,32 @@ def test_dataset_tag_response_is_identical(
original = original.json()
new = new.json()
assert original == new
+
+
+@pytest.mark.php()
+@pytest.mark.parametrize(
+ "data_id",
+ list(range(1, 130)),
+)
+def test_datasets_feature_is_identical(
+ data_id: int,
+ py_api: TestClient,
+ php_api: httpx.Client,
+) -> None:
+ if data_id in [55, 56, 59]:
+ pytest.skip('Error message for "274: No features found." not implemented')
+ response = py_api.get(f"/datasets/features/{data_id}")
+ original = php_api.get(f"/data/features/{data_id}")
+ assert response.status_code == original.status_code
+ python_body = response.json()
+ for feature in python_body:
+ for key, value in list(feature.items()):
+ if key == "nominal_values":
+ # The old API uses `nominal_value` instead of `nominal_values`
+ values = feature.pop(key)
+ # The old API returns a str if there is only a single element
+ feature["nominal_value"] = values if len(values) > 1 else values[0]
+ else:
+ # The old API formats bool as string in lower-case
+ feature[key] = str(value) if not isinstance(value, bool) else str(value).lower()
+ assert python_body == original.json()["data_features"]["feature"]
From f5efd96326a269e7e34c6ca62e344a0cd9b2d1c2 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Wed, 29 Nov 2023 17:06:22 +0100
Subject: [PATCH 3/6] Return detailed error codes when features are not found
---
src/routers/openml/datasets.py | 19 +++++++++++++++++++
.../migration/datasets_migration_test.py | 9 +++++++--
2 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index d707148a..b10a8417 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -295,6 +295,25 @@ def get_dataset_features(
features = get_features_for_dataset(dataset_id, expdb)
for feature in [f for f in features if f.data_type == FeatureType.NOMINAL]:
feature.nominal_values = get_feature_values(dataset_id, feature.index, expdb)
+
+ if not features:
+ processing_state = get_latest_processing_update(dataset_id, expdb)
+ if processing_state is None:
+ code, msg = (
+ 273,
+ "Dataset not processed yet. The dataset was not processed yet, features are not yet available. Please wait for a few minutes.", # noqa: E501
+ )
+ elif processing_state.get("error"):
+ code, msg = 274, "No features found. Additionally, dataset processed with error"
+ else:
+ code, msg = (
+ 272,
+ "No features found. The dataset did not contain any features, or we could not extract them.", # noqa: E501
+ )
+ raise HTTPException(
+ status_code=http.client.PRECONDITION_FAILED,
+ detail={"code": code, "message": msg},
+ )
return features
diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py
index 19ab7785..d7460dcc 100644
--- a/tests/routers/openml/migration/datasets_migration_test.py
+++ b/tests/routers/openml/migration/datasets_migration_test.py
@@ -202,11 +202,16 @@ def test_datasets_feature_is_identical(
py_api: TestClient,
php_api: httpx.Client,
) -> None:
- if data_id in [55, 56, 59]:
- pytest.skip('Error message for "274: No features found." not implemented')
response = py_api.get(f"/datasets/features/{data_id}")
original = php_api.get(f"/data/features/{data_id}")
assert response.status_code == original.status_code
+
+ if response.status_code != http.client.OK:
+ error = response.json()["detail"]
+ error["code"] = str(error["code"])
+ assert error == original.json()["error"]
+ return
+
python_body = response.json()
for feature in python_body:
for key, value in list(feature.items()):
From df8a2c694ac7e333dbf2ab50f672a8fe92e98439 Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 30 Nov 2023 10:30:13 +0100
Subject: [PATCH 4/6] Add access tests for datasets/features/{id} endpoint
---
tests/routers/openml/datasets_test.py | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index 37829c19..1fa2a619 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -5,6 +5,8 @@
import pytest
from starlette.testclient import TestClient
+from tests.conftest import ApiKey
+
@pytest.mark.parametrize(
("dataset_id", "response_code"),
@@ -112,3 +114,17 @@ def test_dataset_features(py_api: TestClient) -> None:
"number_of_missing_values": 0,
},
]
+
+
+def test_dataset_features_no_access(py_api: TestClient) -> None:
+ response = py_api.get("/datasets/features/130")
+ assert response.status_code == http.client.FORBIDDEN
+
+
+@pytest.mark.parametrize(
+ "api_key",
+ [ApiKey.ADMIN, ApiKey.OWNER_USER],
+)
+def test_dataset_features_access_to_private(api_key: ApiKey, py_api: TestClient) -> None:
+ response = py_api.get(f"/datasets/features/130?api_key={api_key}")
+ assert response.status_code == http.client.OK
From cd534f8c17461587c7858bbefffc095831db730b Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 30 Nov 2023 10:36:51 +0100
Subject: [PATCH 5/6] Test datasets/features/{id} for dataset with error in
processing
---
tests/routers/openml/datasets_test.py | 11 +++++++++++
1 file changed, 11 insertions(+)
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index 1fa2a619..04376f02 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -128,3 +128,14 @@ def test_dataset_features_no_access(py_api: TestClient) -> None:
def test_dataset_features_access_to_private(api_key: ApiKey, py_api: TestClient) -> None:
response = py_api.get(f"/datasets/features/130?api_key={api_key}")
assert response.status_code == http.client.OK
+
+
+def test_dataset_features_with_processing_error(py_api: TestClient) -> None:
+ # When a dataset is processed to extract its feature metadata, errors may occur.
+ # In that case, no feature information will ever be available.
+ response = py_api.get("/datasets/features/55")
+ assert response.status_code == http.client.PRECONDITION_FAILED
+ assert response.json()["detail"] == {
+ "code": 274,
+ "message": "No features found. Additionally, dataset processed with error",
+ }
From ab4f72d8417e7ea13b459efa6b31110a1408e12f Mon Sep 17 00:00:00 2001
From: PGijsbers
Date: Thu, 30 Nov 2023 10:39:14 +0100
Subject: [PATCH 6/6] Add test for datasets/features/{id} when dataset does not exist
---
tests/routers/openml/datasets_test.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index 04376f02..52a8e711 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -139,3 +139,8 @@ def test_dataset_features_with_processing_error(py_api: TestClient) -> None:
"code": 274,
"message": "No features found. Additionally, dataset processed with error",
}
+
+
+def test_dataset_features_dataset_does_not_exist(py_api: TestClient) -> None:
+    response = py_api.get("/datasets/features/1000")
+    assert response.status_code == http.client.NOT_FOUND