diff --git a/src/database/datasets.py b/src/database/datasets.py
index f69a035a..e1626e74 100644
--- a/src/database/datasets.py
+++ b/src/database/datasets.py
@@ -131,6 +131,25 @@ def get_features(dataset_id: int, connection: Connection) -> list[Feature]:
     return [Feature(**row, nominal_values=None) for row in rows.mappings()]
 
 
+def get_feature_ontologies(dataset_id: int, connection: Connection) -> dict[int, list[str]]:
+    """Return a mapping from feature index to its list of ontology URIs."""
+    rows = connection.execute(
+        text(
+            """
+            SELECT `index`, `value`
+            FROM data_feature_description
+            WHERE `did` = :dataset_id AND `description_type` = 'ontology'
+            ORDER BY `index`, `value`
+            """,
+        ),
+        parameters={"dataset_id": dataset_id},
+    )
+    ontologies: dict[int, list[str]] = {}
+    for row in rows.mappings():
+        ontologies.setdefault(row["index"], []).append(row["value"])
+    return ontologies
+
+
 def get_feature_values(dataset_id: int, *, feature_index: int, connection: Connection) -> list[str]:
     rows = connection.execute(
         text(
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index dda25117..df7c79c2 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -293,6 +293,9 @@ def get_dataset_features(
             feature_index=feature.index,
             connection=expdb,
         )
+    ontologies = database.datasets.get_feature_ontologies(dataset_id, expdb)
+    for feature in features:
+        feature.ontology = ontologies.get(feature.index)
 
     if not features:
         processing_state = database.datasets.get_latest_processing_update(dataset_id, expdb)
diff --git a/src/schemas/datasets/openml.py b/src/schemas/datasets/openml.py
index 8edb373c..997055b2 100644
--- a/src/schemas/datasets/openml.py
+++ b/src/schemas/datasets/openml.py
@@ -45,6 +45,7 @@ class Feature(BaseModel):
     is_row_identifier: bool
     number_of_missing_values: int
     nominal_values: list[str] | None
+    ontology: list[str] | None = None
 
 
 class EstimationProcedure(BaseModel):
diff --git a/tests/routers/openml/datasets_test.py b/tests/routers/openml/datasets_test.py
index 769e334a..8c596e65 100644
--- a/tests/routers/openml/datasets_test.py
+++ b/tests/routers/openml/datasets_test.py
@@ -158,6 +158,24 @@ def test_dataset_features(py_api: TestClient) -> None:
     ]
 
 
+def test_dataset_features_with_ontology(py_api: TestClient) -> None:
+    # Dataset 11 has ontology data for features 1, 2, and 3
+    response = py_api.get("/datasets/features/11")
+    assert response.status_code == HTTPStatus.OK
+    features = {f["index"]: f for f in response.json()}
+    assert features[1]["ontology"] == ["https://en.wikipedia.org/wiki/Service_(motor_vehicle)"]
+    assert features[2]["ontology"] == [
+        "https://en.wikipedia.org/wiki/Car_door",
+        "https://en.wikipedia.org/wiki/Door",
+    ]
+    assert features[3]["ontology"] == [
+        "https://en.wikipedia.org/wiki/Passenger_vehicles_in_the_United_States"
+    ]
+    # Features without ontology should not include the field
+    assert "ontology" not in features[0]
+    assert "ontology" not in features[4]
+
+
 def test_dataset_features_no_access(py_api: TestClient) -> None:
     response = py_api.get("/datasets/features/130")
     assert response.status_code == HTTPStatus.FORBIDDEN
diff --git a/tests/routers/openml/migration/datasets_migration_test.py b/tests/routers/openml/migration/datasets_migration_test.py
index 011d8dba..ddf60daf 100644
--- a/tests/routers/openml/migration/datasets_migration_test.py
+++ b/tests/routers/openml/migration/datasets_migration_test.py
@@ -222,10 +222,12 @@ def test_datasets_feature_is_identical(
                 values = feature.pop(key)
                 # The old API returns a str if there is only a single element
                 feature["nominal_value"] = values if len(values) > 1 else values[0]
+            elif key == "ontology":
+                # The old API returns a str if there is only a single element
+                values = feature.pop(key)
+                feature["ontology"] = values if len(values) > 1 else values[0]
             else:
                 # The old API formats bool as string in lower-case
                 feature[key] = str(value) if not isinstance(value, bool) else str(value).lower()
     original_features = original.json()["data_features"]["feature"]
-    for feature in original_features:
-        feature.pop("ontology", None)
     assert python_body == original_features