diff --git a/pyproject.toml b/pyproject.toml
index d3b013c7..89c0e62c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -20,6 +20,8 @@ dependencies = [
     "mysqlclient",
     "python_dotenv",
     "xmltodict",
+    "liac-arff",
+    "pyarrow",
 ]
 
 [project.optional-dependencies]
diff --git a/src/core/feature_analysis.py b/src/core/feature_analysis.py
new file mode 100644
index 00000000..25985e85
--- /dev/null
+++ b/src/core/feature_analysis.py
@@ -0,0 +1,144 @@
+from collections.abc import Iterable
+from typing import IO
+
+import arff
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from schemas.datasets.openml import Feature, FeatureType
+
+
+def _count_missing(data: list, column_index: int) -> int:
+    """Count missing (None) values in one column of liac-arff data.
+
+    Supports both dense rows (lists) and sparse rows (dicts). In sparse
+    ARFF an index absent from the dict means the default value (0), not
+    a missing value; explicit '?' entries are parsed to None by liac-arff.
+    """
+    missing = 0
+    for row in data:
+        if isinstance(row, dict):
+            if column_index in row and row[column_index] is None:
+                missing += 1
+        elif row[column_index] is None:
+            missing += 1
+    return missing
+
+
+def analyze_arff(
+    arff_stream: IO[str],
+    target_features: Iterable[str] | None = None,
+    ignore_features: Iterable[str] | None = None,
+    row_id_features: Iterable[str] | None = None,
+) -> list[Feature]:
+    """Analyze an ARFF file and return a list of Feature objects.
+
+    Args:
+        arff_stream: Text stream with the ARFF content.
+        target_features: Names of features used as prediction targets.
+        ignore_features: Names of features to ignore.
+        row_id_features: Names of features that identify rows.
+    """
+    dataset = arff.load(arff_stream)
+    attributes = dataset["attributes"]
+    data = dataset["data"]
+
+    target_features = set(target_features or [])
+    ignore_features = set(ignore_features or [])
+    row_id_features = set(row_id_features or [])
+
+    features = []
+    for i, (name, type_info) in enumerate(attributes):
+        if isinstance(type_info, list):
+            # liac-arff represents a nominal attribute as its list of values.
+            data_type = FeatureType.NOMINAL
+            nominal_values = type_info
+        elif type_info.upper() in ("NUMERIC", "REAL", "INTEGER"):
+            data_type = FeatureType.NUMERIC
+            nominal_values = None
+        else:
+            # STRING, DATE, and any unrecognized type fall back to STRING.
+            data_type = FeatureType.STRING
+            nominal_values = None
+
+        features.append(
+            Feature(
+                index=i,
+                name=name,
+                data_type=data_type,
+                is_target=name in target_features,
+                is_ignore=name in ignore_features,
+                is_row_identifier=name in row_id_features,
+                number_of_missing_values=_count_missing(data, i) if data else 0,
+                nominal_values=nominal_values,
+            ),
+        )
+    return features
+
+
+def analyze_parquet(
+    source: str | IO[bytes],
+    target_features: Iterable[str] | None = None,
+    ignore_features: Iterable[str] | None = None,
+    row_id_features: Iterable[str] | None = None,
+) -> list[Feature]:
+    """Analyze a Parquet file and return a list of Feature objects.
+
+    Numeric columns map to NUMERIC; dictionary-encoded, string, and
+    boolean columns map to NOMINAL (with their unique values recorded);
+    every other type falls back to STRING.
+    """
+    table = pq.read_table(source)
+
+    target_features = set(target_features or [])
+    ignore_features = set(ignore_features or [])
+    row_id_features = set(row_id_features or [])
+
+    features = []
+    for i, field in enumerate(table.schema):
+        name = field.name
+        pa_type = field.type
+        column = table.column(i)
+
+        nominal_values = None
+        if pa.types.is_floating(pa_type) or pa.types.is_integer(pa_type):
+            data_type = FeatureType.NUMERIC
+        elif pa.types.is_dictionary(pa_type):
+            data_type = FeatureType.NOMINAL
+            # The dictionary of each chunk holds the category values;
+            # merge across chunks since dictionaries may differ per chunk.
+            unique_values = {
+                value.as_py()
+                for chunk in column.chunks
+                for value in chunk.dictionary
+            }
+            nominal_values = sorted(unique_values)
+        elif pa.types.is_boolean(pa_type):
+            data_type = FeatureType.NOMINAL
+            nominal_values = ["false", "true"]
+        elif pa.types.is_string(pa_type):
+            # Treat plain string columns as nominal (common in ML datasets),
+            # recording the observed unique non-null values.
+            data_type = FeatureType.NOMINAL
+            unique_values = set()
+            for chunk in column.chunks:
+                for value in chunk.unique():
+                    v = value.as_py()
+                    if v is not None:
+                        unique_values.add(str(v))
+            nominal_values = sorted(unique_values)
+        else:
+            data_type = FeatureType.STRING
+
+        features.append(
+            Feature(
+                index=i,
+                name=name,
+                data_type=data_type,
+                is_target=name in target_features,
+                is_ignore=name in ignore_features,
+                is_row_identifier=name in row_id_features,
+                number_of_missing_values=column.null_count,
+                nominal_values=nominal_values,
+            ),
+        )
+    return features
diff --git a/tests/core/test_feature_analysis.py b/tests/core/test_feature_analysis.py
new file mode 100644
index 00000000..488047d7
--- /dev/null
+++ b/tests/core/test_feature_analysis.py
@@ -0,0 +1,138 @@
+from io import BytesIO, StringIO
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from core.feature_analysis import analyze_arff, analyze_parquet
+from schemas.datasets.openml import FeatureType
+
+
+def test_analyze_arff_dense():
+    arff_data = """@RELATION test
+@ATTRIBUTE a NUMERIC
+@ATTRIBUTE b {x,y}
+@ATTRIBUTE c STRING
+@DATA
+1,x,hello
+2,y,world
+?,? ,?
+"""
+    features = analyze_arff(StringIO(arff_data))
+    assert len(features) == 3
+
+    assert features[0].name == "a"
+    assert features[0].data_type == FeatureType.NUMERIC
+    assert features[0].number_of_missing_values == 1
+    assert features[0].nominal_values is None
+
+    assert features[1].name == "b"
+    assert features[1].data_type == FeatureType.NOMINAL
+    assert features[1].nominal_values == ["x", "y"]
+    assert features[1].number_of_missing_values == 1
+
+    assert features[2].name == "c"
+    assert features[2].data_type == FeatureType.STRING
+    assert features[2].number_of_missing_values == 1
+
+
+def test_analyze_arff_sparse():
+    arff_data = """@RELATION test
+@ATTRIBUTE a NUMERIC
+@ATTRIBUTE b NUMERIC
+@ATTRIBUTE c {X,Y}
+@DATA
+{0 1, 2 X}
+{1 5}
+{0 ?, 2 ?}
+"""
+    features = analyze_arff(StringIO(arff_data))
+    assert len(features) == 3
+
+    # index 0: 1, missing(0), ? -> 1 missing
+    assert features[0].name == "a"
+    assert features[0].number_of_missing_values == 1
+
+    # index 1: missing(0), 5, missing(0) -> 0 missing
+    assert features[1].name == "b"
+    assert features[1].number_of_missing_values == 0
+
+    # index 2: X, missing(None?), ?
+    # Row 1: {1 5} -> index 2 is missing. In sparse ARFF, if it's missing it's the 0-th element for nominal.
+    assert features[2].name == "c"
+    assert features[2].number_of_missing_values == 1  # Only from row 2 {0 ?, 2 ?}
+
+
+def test_analyze_arff_sparse_all_missing():
+    arff_data = """@RELATION sparse
+@ATTRIBUTE a NUMERIC
+@DATA
+{0 ?}
+?
+{}
+"""
+    # row 0: ? -> missing
+    # row 1: ? -> missing
+    # row 2: {} -> index 0 is missing from dict -> default (0) -> NOT missing
+    features = analyze_arff(StringIO(arff_data))
+    assert features[0].number_of_missing_values == 2
+
+
+def test_analyze_arff_metadata():
+    arff_data = """@RELATION test
+@ATTRIBUTE a NUMERIC
+@ATTRIBUTE b NUMERIC
+@ATTRIBUTE c NUMERIC
+@DATA
+1,2,3
+"""
+    features = analyze_arff(
+        StringIO(arff_data), target_features=["c"], ignore_features=["b"], row_id_features=["a"]
+    )
+    assert features[0].is_row_identifier is True
+    assert features[1].is_ignore is True
+    assert features[2].is_target is True
+    assert features[0].is_target is False
+
+
+def test_analyze_parquet():
+    data = [
+        pa.array([1, 2, None]),
+        pa.array(["cat", "dog", "cat"]),
+        pa.array([True, False, None]),
+        pa.array(["v1", "v2", "v3"], type=pa.dictionary(pa.int8(), pa.string())),
+    ]
+    schema = pa.schema(
+        [
+            ("f1", pa.int64()),
+            ("f2", pa.string()),
+            ("f3", pa.bool_()),
+            ("f4", pa.dictionary(pa.int8(), pa.string())),
+        ]
+    )
+    table = pa.Table.from_arrays(data, schema=schema)
+
+    buf = BytesIO()
+    pq.write_table(table, buf)
+    buf.seek(0)
+
+    features = analyze_parquet(buf, target_features=["f3"])
+
+    assert len(features) == 4
+
+    assert features[0].name == "f1"
+    assert features[0].data_type == FeatureType.NUMERIC
+    assert features[0].number_of_missing_values == 1
+
+    assert features[1].name == "f2"
+    assert features[1].data_type == FeatureType.NOMINAL
+    assert sorted(features[1].nominal_values) == ["cat", "dog"]
+
+    assert features[2].name == "f3"
+    assert features[2].data_type == FeatureType.NOMINAL
+    assert features[2].is_target is True
+    assert features[2].number_of_missing_values == 1
+    assert features[2].nominal_values == ["false", "true"]
+
+    assert features[3].name == "f4"
+    assert features[3].data_type == FeatureType.NOMINAL
+    assert sorted(features[3].nominal_values) == ["v1", "v2", "v3"]