diff --git a/src/database/datasets.py b/src/database/datasets.py
index f011a651..f18b21e7 100644
--- a/src/database/datasets.py
+++ b/src/database/datasets.py
@@ -131,6 +131,23 @@ def get_features(dataset_id: int, connection: Connection) -> list[Feature]:
     return [Feature(**row, nominal_values=None) for row in rows.mappings()]
 
 
+def get_feature_ontologies(dataset_id: int, connection: Connection) -> dict[int, list[str]]:
+    rows = connection.execute(
+        text(
+            """
+            SELECT `index`, `value`
+            FROM data_feature_description
+            WHERE `did` = :dataset_id AND `description_type` = 'ontology'
+            """,
+        ),
+        parameters={"dataset_id": dataset_id},
+    )
+    ontologies: dict[int, list[str]] = {}
+    for row in rows.mappings():
+        ontologies.setdefault(row["index"], []).append(row["value"])
+    return ontologies
+
+
 def get_feature_values(dataset_id: int, *, feature_index: int, connection: Connection) -> list[str]:
     rows = connection.execute(
         text(
diff --git a/src/routers/openml/datasets.py b/src/routers/openml/datasets.py
index dda25117..1072296b 100644
--- a/src/routers/openml/datasets.py
+++ b/src/routers/openml/datasets.py
@@ -287,6 +287,12 @@ def get_dataset_features(
 ) -> list[Feature]:
     _get_dataset_raise_otherwise(dataset_id, user, expdb)
     features = database.datasets.get_features(dataset_id, expdb)
+
+    # Attach ontologies from data_feature_description
+    ontologies = database.datasets.get_feature_ontologies(dataset_id, expdb)
+    for feature in features:
+        feature.ontology = ontologies.get(feature.index)
+
     for feature in [f for f in features if f.data_type == FeatureType.NOMINAL]:
         feature.nominal_values = database.datasets.get_feature_values(
             dataset_id,
diff --git a/src/schemas/datasets/openml.py b/src/schemas/datasets/openml.py
index 8edb373c..b1f51574 100644
--- a/src/schemas/datasets/openml.py
+++ b/src/schemas/datasets/openml.py
@@ -40,6 +40,7 @@ class Feature(BaseModel):
     index: int
     name: str
     data_type: FeatureType
+    ontology: list[str] | None = None
     is_target: bool
     is_ignore: bool
     is_row_identifier: bool